{
 "cells": [
  {
   "cell_type": "markdown",
   "id": "aa460049-640b-4961-981a-affc463c8a3b",
   "metadata": {},
   "source": [
    "# 1 准备工作"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "3956f857-03ce-481c-bb42-abbe139311bd",
   "metadata": {},
   "source": [
    "## 1.1 安装相关包"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 1,
   "id": "3fb17e0f-b216-45a1-874a-89b4aedbd88e",
   "metadata": {},
   "outputs": [],
   "source": [
    "# !apt update\n",
    "# !apt install unrar -y\n",
    "# !apt install mdbtools -y\n",
    "# !apt install default-jre -y"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "id": "acee0a2a-18b5-463a-adb4-d636acf3ea62",
   "metadata": {},
   "outputs": [],
   "source": [
    "# !java --version"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "id": "bb4535f7-9daf-474e-86a0-301054e782c8",
   "metadata": {},
   "outputs": [],
   "source": [
    "# !pip install pytz\n",
    "# !pip install rarfile\n",
    "# !pip install jaydebeapi\n",
    "# !pip install pyodbc\n",
    "# !pip install uuid\n",
    "# !pip install tenacity"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "78abd657-675f-4d8c-8251-52752fef53b5",
   "metadata": {},
   "source": [
    "## 1.2 加载包"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "id": "3a0df1bd-91b1-476d-bf6a-9c17983db382",
   "metadata": {},
   "outputs": [],
   "source": [
    "import time\n",
    "import pytz\n",
    "from datetime import datetime\n",
    "import json, re, gc\n",
    "import glob\n",
    "import os, sys, rarfile\n",
    "from tqdm import tqdm\n",
    "# from typing import Dict, Any\n",
    "from pathlib import Path\n",
    "\n",
    "import platform\n",
    "import jaydebeapi\n",
    "import jpype\n",
    "import jpype.imports\n",
    "from jpype.types import *\n",
    "import subprocess\n",
    "import pyodbc, io\n",
    "# import difflib\n",
    "\n",
    "import numpy as np\n",
    "import pandas as pd\n",
    "import math\n",
    "from concurrent.futures import ThreadPoolExecutor, as_completed\n",
    "import torch\n",
    "import pickle\n",
    "# from openai import OpenAI\n",
    "import warnings, logging\n",
    "sys.path.append('./')\n",
    "\n",
    "from utils.load_access_data import *\n",
    "from utils.file_unrar import *\n",
    "# from utils.data_mapping import *\n",
    "# from utils.data_mapping_36country3 import *\n",
    "# from utils.invoke_llm import *\n",
    "from utils.file_process import *\n",
    "from utils.bash_mapping import *\n",
    "from utils.logging_config import setup_logger\n",
    "\n",
    "warnings.filterwarnings(\"ignore\")\n",
    "logger = setup_logger('Data Goverance')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "id": "0d22cb7f-a935-4e82-bb54-a1cc22e6d6a0",
   "metadata": {},
   "outputs": [],
   "source": [
    "from datetime import datetime"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "id": "1e70e112-297d-41ad-b94f-ef1c9bb884e8",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "2025-09-10 02:27:22,981 - Data Goverance - INFO - 2025-09-10 10:27:22.981187+08:00\n"
     ]
    }
   ],
   "source": [
    "# Timing: capture the current time in the Asia/Shanghai zone and log it.\n",
    "tz_beijing = pytz.timezone('Asia/Shanghai')\n",
    "bj_now = datetime.now(tz=tz_beijing)\n",
    "# Naive datetime at midnight of bj_now's calendar date.\n",
    "dt_now = datetime(bj_now.year, bj_now.month, bj_now.day)\n",
    "logger.info(bj_now)\n",
    "\n",
    "# Record the start timestamp (seconds since the epoch).\n",
    "all_start = time.time()"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "14176eb0-f67a-4053-96f8-ff299df631ae",
   "metadata": {},
   "source": [
    "## 1.3 准备全局变量"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 7,
   "id": "2baa7fe2-b85d-4299-b235-d672e338ab78",
   "metadata": {},
   "outputs": [],
   "source": [
    "class Config:\n",
    "    # Ordered list of the 47 target column names.\n",
    "    target_cols = [\n",
    "        'channelno', 'dataid', 'iesign', 'datatype', 'writeoffflag',\n",
    "        'writeoffdataid', 'outputdate', 'origincountrycode', 'origincountry', 'countrycodeofdelivery',\n",
    "        'countryofdelivery', 'importername', 'importeraddress', 'importercontact', 'suppliername',\n",
    "        'supplieraddress', 'suppliercontact', 'hscode', 'hscodedescription', 'commoditydescription',\n",
    "        'totalcifvalue', 'totalfobvalue', 'grossweight', 'netweight', 'quantity',\n",
    "        'quantityunit', 'teu', 'importer_forwarderagent', 'supplier_forwarderagent', 'abnormaldata',\n",
    "        'portofloading', 'portofdestination', 'loadingcountrycode', 'loadingcountry', 'transportterm',\n",
    "        'tradeterm', 'paymentterm', 'carrier', 'containerno', 'vesselname',\n",
    "        'brand', 'version', 'country', 'IMPORTER_ID', 'SUPPLIER_ID', 'cif_currency', 'fob_currency',\n",
    "    ]\n",
    "\n",
    "\n",
    "# config is bound to the class object itself, not to an instance.\n",
    "config = Config"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 8,
   "id": "90881866-9cbb-4442-a5f6-7217e901ce3a",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Load the country mapping table.\n",
    "country_code = pd.read_csv('param_files/国家代码表.csv')\n",
    "# One record per row, restricted to the English and Chinese country-name columns.\n",
    "country_code_dict = country_code[['英文国名', '中文国名']].to_dict(orient='records')\n",
    "# One record per row, keeping every column of the table.\n",
    "country_code_dict_detail = country_code.to_dict(orient='records')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 9,
   "id": "67fbe1a5-8a7b-4d95-ae58-ef5a97913cc9",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Load the code table for the 36 countries.\n",
    "data_country36_code = pd.read_csv('param_files/数据36国家代码表.csv')\n",
    "data_country36_dict = data_country36_code.to_dict(orient='records')\n",
    "\n",
    "# Key each record by its Chinese country name; the value keeps the English name and the country code.\n",
    "country36_dict = {\n",
    "    row.get('中文国名'): {'英文国名': row.get('英文国名'), '国别编码': row.get('国别编码')}\n",
    "    for row in data_country36_dict\n",
    "}"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 10,
   "id": "ff0099c1-7091-40f2-8fe0-00618ab32c97",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Intermediate processing results, loaded from a pickle file.\n",
    "# NOTE(review): pickle.load can execute arbitrary code; only load this file from a trusted source.\n",
    "# The with-block closes the file handle as soon as loading finishes.\n",
    "with open('param_files/model_rslt_05_20250810.pickle', 'rb') as model_result_file:\n",
    "    model_result = pickle.load(model_result_file)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 11,
   "id": "0158c76f-e0df-44d3-9235-12945187e89b",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Both names refer to the same object stored under 'all_country_cols_dict'.\n",
    "all_country_cols_dict = model_result['all_country_cols_dict']\n",
    "country36_cols_dict = all_country_cols_dict\n",
    "\n",
    "# Pull the remaining entries out of model_result in one unpacking step.\n",
    "(all_country_list, llm_country_describe, llm_country_mapping,\n",
    " country_mapping, currency_df, currency_on_usd) = (\n",
    "    model_result[key]\n",
    "    for key in (\n",
    "        'all_country_list', 'llm_country_describe', 'llm_country_mapping',\n",
    "        'country_mapping', 'currency_df', 'currency_on_usd',\n",
    "    )\n",
    ")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 12,
   "id": "919d9b79-5ccd-4bc7-a1f6-704526b90632",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "dict_keys(['all_country_cols_dict', 'all_country_list', 'llm_country_describe', 'llm_country_mapping', 'country_mapping', 'currency_df', 'currency_on_usd'])"
      ]
     },
     "execution_count": 12,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "model_result.keys()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 13,
   "id": "eb6ed96d-7eef-45e7-acbb-4ffeeb3760ba",
   "metadata": {},
   "outputs": [],
   "source": [
    "# len(model_result['country_mapping'])"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "bf6830bc-d070-4f45-a9de-13fceee9f39b",
   "metadata": {},
   "source": [
    "# 2 原始文件批量解压\n",
    "\n",
    "原始数据保存路径： dataset/data_20250724/2025年1-3月汇总\n",
    "\n",
    "批量解压至： dataset/data_20250724/20251t3_unrar"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 14,
   "id": "4fe99ca6-b41b-40d2-981c-880874d3acee",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "2025-09-10 02:27:23,060 - Data Goverance - INFO - number of rar_files:153, number of non_rar_files:1\n"
     ]
    }
   ],
   "source": [
    "# Root directory to traverse.\n",
    "root_dir = Path('dataset/data_20250724/2025年1-3月汇总')\n",
    "dir_paths, file_paths = get_dir_files(root_dir)\n",
    "\n",
    "# Split the files into .rar archives and everything else.\n",
    "# Comparing the lower-cased Path suffix also matches upper-case extensions such as '.RAR'.\n",
    "rar_file_paths = [p for p in file_paths if Path(str(p)).suffix.lower() == '.rar']\n",
    "nrar_file_paths = [p for p in file_paths if Path(str(p)).suffix.lower() != '.rar']\n",
    "logger.info(f'number of rar_files:{len(rar_file_paths)}, number of non_rar_files:{len(nrar_file_paths)}')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 15,
   "id": "90e89919-162e-4eda-b93f-e968e47a71d5",
   "metadata": {
    "scrolled": true
   },
   "outputs": [],
   "source": [
    "# # 批量解压至“dataset/data_20250724/20251t3_unrar”\n",
    "# # 请谨慎执行，如果解压过请不要再运行\n",
    "# extract_rar_batch(file_paths, source_str='2025年1-3月汇总', replace_str='20251t3_unrar')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 16,
   "id": "e868c9d1-a325-4ffc-a0c7-baabdc443541",
   "metadata": {
    "scrolled": true
   },
   "outputs": [],
   "source": [
    "# # 检查 检查\n",
    "# !apt update && apt install rar unrar\n",
    "# !unrar x dataset/data_20250724/20251t3_unrar/乌克兰/UA_IMPORT_202506_MDB.rar dataset/data_20250724/20251t3_unrar/乌克兰\n",
    "# !unrar x dataset/data_20250724/20251t3_unrar/乌克兰/UA_EXPORT_202506_MDB.rar dataset/data_20250724/20251t3_unrar/乌克兰\n",
    "# !rm -rf dataset/data_20250724/20251t3_unrar/乌克兰/UA_IMPORT_202506_MDB.rar\n",
    "# !rm -rf dataset/data_20250724/20251t3_unrar/乌克兰/UA_EXPORT_202506_MDB.rar\n"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "9ca558fd-2b08-4340-847b-5c0cf25c3af4",
   "metadata": {},
   "source": [
    "# 3 解压的各种格式数据统一转为csv"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "20fdd3cb-ca1a-4c3e-af5e-de00d0aaa8f6",
   "metadata": {},
   "source": [
    "\n",
    "- 添加数据主键 dataid\n",
    "- 添加数据主键 country\n",
    "- 添加数据主键 iesign\n",
    "\n",
    "转换后数据路径为：dataset/data_20250724/20251t3_src"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 17,
   "id": "c80de384-6c66-4822-bbc9-fd04180d08c8",
   "metadata": {},
   "outputs": [],
   "source": [
    "country_list = ['智利', '坦桑尼亚', '乌克兰']\n",
    "# country_list = ['智利']\n",
    "\n",
    "# 获取文件目录，区分出口、进口数据\n",
    "from_dir = Path('dataset/data_20250724/20251t3_unrar')\n",
    "to_dir = Path('dataset/data_20250724/20251t3_src')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 18,
   "id": "805beda6-f192-4982-8690-72e74c96cde3",
   "metadata": {
    "scrolled": true
   },
   "outputs": [],
   "source": [
    "# # # 慎重跑这个结果，会全量抽取原始数据\n",
    "# all_country_cols_dict = get_source_csv(country_list, from_dir, to_dir, country36_dict)\n"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "96353e48-3495-4199-ac9d-59939b21ed43",
   "metadata": {},
   "source": [
    "# 4 更正数据月份至version字段"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 19,
   "id": "2e90119f-c8fd-4702-a6ea-d72bfae0f723",
   "metadata": {},
   "outputs": [],
   "source": [
    "root_dir = Path('dataset/data_20250724/20251t3_src')\n",
    "dir_paths, file_paths = get_dir_files(root_dir)\n",
    "file_tree = distinct_ie_data(root_dir, country_list)\n",
    "\n",
    "save_dir = Path('dataset/data_20250724/20251t3_src_version')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 20,
   "id": "5882d3db-5198-4d42-8815-ec544887a908",
   "metadata": {},
   "outputs": [],
   "source": [
    "country_list = ['智利', '坦桑尼亚', '乌克兰']\n",
    "# country_list = ['智利']"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 21,
   "id": "0a30c055-d35e-4ab2-85d6-c722bfd51e49",
   "metadata": {
    "scrolled": true
   },
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "  0% 0/3 [00:00<?, ?it/s]"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "2025-09-10 02:27:23,097 - File Process - INFO - 智利\n",
      "2025-09-10 02:27:23,098 - File Process - INFO - read data: dataset/data_20250724/20251t3_src/智利/import_paths/202501-IMP.csv\n",
      "202501\n",
      "2025-09-10 02:27:30,453 - File Process - INFO - data: (383012, 47)\n",
      "2025-09-10 02:27:30,453 - File Process - INFO - new data save: dataset/data_20250724/20251t3_src_version/智利/import_paths/202501-IMP.csv\n",
      "2025-09-10 02:27:30,454 - File Process - INFO - read data: dataset/data_20250724/20251t3_src/智利/import_paths/202502-IMP.csv\n",
      "202502\n",
      "2025-09-10 02:27:36,923 - File Process - INFO - data: (354974, 47)\n",
      "2025-09-10 02:27:36,925 - File Process - INFO - new data save: dataset/data_20250724/20251t3_src_version/智利/import_paths/202502-IMP.csv\n",
      "2025-09-10 02:27:36,927 - File Process - INFO - read data: dataset/data_20250724/20251t3_src/智利/import_paths/202503-IMP.csv\n",
      "202503\n",
      "2025-09-10 02:27:44,727 - File Process - INFO - data: (385873, 47)\n",
      "2025-09-10 02:27:44,728 - File Process - INFO - new data save: dataset/data_20250724/20251t3_src_version/智利/import_paths/202503-IMP.csv\n",
      "2025-09-10 02:27:44,729 - File Process - INFO - read data: dataset/data_20250724/20251t3_src/智利/export_paths/202502-EXP.csv\n",
      "202502\n",
      "2025-09-10 02:27:46,470 - File Process - INFO - data: (98419, 46)\n",
      "2025-09-10 02:27:46,472 - File Process - INFO - new data save: dataset/data_20250724/20251t3_src_version/智利/export_paths/202502-EXP.csv\n",
      "2025-09-10 02:27:46,473 - File Process - INFO - read data: dataset/data_20250724/20251t3_src/智利/export_paths/202501-EXP.csv\n",
      "202501\n",
      "2025-09-10 02:27:48,648 - File Process - INFO - data: (127496, 46)\n",
      "2025-09-10 02:27:48,648 - File Process - INFO - new data save: dataset/data_20250724/20251t3_src_version/智利/export_paths/202501-EXP.csv\n",
      "2025-09-10 02:27:48,649 - File Process - INFO - read data: dataset/data_20250724/20251t3_src/智利/export_paths/202503-EXP.csv\n",
      "202503\n",
      "2025-09-10 02:27:50,388 - File Process - INFO - data: (100956, 46)\n",
      "2025-09-10 02:27:50,390 - File Process - INFO - new data save: dataset/data_20250724/20251t3_src_version/智利/export_paths/202503-EXP.csv\n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      " 33% 1/3 [00:27<00:54, 27.29s/it]"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "2025-09-10 02:27:50,393 - File Process - INFO - 坦桑尼亚\n",
      "2025-09-10 02:27:50,395 - File Process - INFO - read data: dataset/data_20250724/20251t3_src/坦桑尼亚/import_paths/202501-IMP-RAW.csv\n",
      "202501\n",
      "2025-09-10 02:28:00,549 - File Process - INFO - data: (432559, 74)\n",
      "2025-09-10 02:28:00,551 - File Process - INFO - new data save: dataset/data_20250724/20251t3_src_version/坦桑尼亚/import_paths/202501-IMP-RAW.csv\n",
      "2025-09-10 02:28:00,552 - File Process - INFO - read data: dataset/data_20250724/20251t3_src/坦桑尼亚/import_paths/202503-IMP-RAW.csv\n",
      "202503\n",
      "2025-09-10 02:28:08,767 - File Process - INFO - data: (345735, 74)\n",
      "2025-09-10 02:28:08,768 - File Process - INFO - new data save: dataset/data_20250724/20251t3_src_version/坦桑尼亚/import_paths/202503-IMP-RAW.csv\n",
      "2025-09-10 02:28:08,768 - File Process - INFO - read data: dataset/data_20250724/20251t3_src/坦桑尼亚/import_paths/202502-IMP-RAW.csv\n",
      "202502\n",
      "2025-09-10 02:28:17,307 - File Process - INFO - data: (359832, 74)\n",
      "2025-09-10 02:28:17,309 - File Process - INFO - new data save: dataset/data_20250724/20251t3_src_version/坦桑尼亚/import_paths/202502-IMP-RAW.csv\n",
      "2025-09-10 02:28:17,311 - File Process - INFO - read data: dataset/data_20250724/20251t3_src/坦桑尼亚/export_paths/202501-EXP-RAW.csv\n",
      "202501\n",
      "2025-09-10 02:28:17,702 - File Process - INFO - data: (9990, 74)\n",
      "2025-09-10 02:28:17,702 - File Process - INFO - new data save: dataset/data_20250724/20251t3_src_version/坦桑尼亚/export_paths/202501-EXP-RAW.csv\n",
      "2025-09-10 02:28:17,703 - File Process - INFO - read data: dataset/data_20250724/20251t3_src/坦桑尼亚/export_paths/202503-EXP-RAW.csv\n",
      "202503\n",
      "2025-09-10 02:28:17,934 - File Process - INFO - data: (7761, 74)\n",
      "2025-09-10 02:28:17,935 - File Process - INFO - new data save: dataset/data_20250724/20251t3_src_version/坦桑尼亚/export_paths/202503-EXP-RAW.csv\n",
      "2025-09-10 02:28:17,935 - File Process - INFO - read data: dataset/data_20250724/20251t3_src/坦桑尼亚/export_paths/202503-TRANS-RAW.csv\n",
      "202503\n",
      "2025-09-10 02:28:20,990 - File Process - INFO - data: (133800, 74)\n",
      "2025-09-10 02:28:20,992 - File Process - INFO - new data save: dataset/data_20250724/20251t3_src_version/坦桑尼亚/export_paths/202503-TRANS-RAW.csv\n",
      "2025-09-10 02:28:20,994 - File Process - INFO - read data: dataset/data_20250724/20251t3_src/坦桑尼亚/export_paths/202502-TRANS-RAW.csv\n",
      "202502\n",
      "2025-09-10 02:28:23,999 - File Process - INFO - data: (129629, 74)\n",
      "2025-09-10 02:28:24,001 - File Process - INFO - new data save: dataset/data_20250724/20251t3_src_version/坦桑尼亚/export_paths/202502-TRANS-RAW.csv\n",
      "2025-09-10 02:28:24,003 - File Process - INFO - read data: dataset/data_20250724/20251t3_src/坦桑尼亚/export_paths/202501-TRANS-RAW.csv\n",
      "202501\n",
      "2025-09-10 02:28:27,536 - File Process - INFO - data: (152034, 74)\n",
      "2025-09-10 02:28:27,536 - File Process - INFO - new data save: dataset/data_20250724/20251t3_src_version/坦桑尼亚/export_paths/202501-TRANS-RAW.csv\n",
      "2025-09-10 02:28:27,537 - File Process - INFO - read data: dataset/data_20250724/20251t3_src/坦桑尼亚/export_paths/202502-EXP-RAW.csv\n",
      "202502\n",
      "2025-09-10 02:28:27,815 - File Process - INFO - data: (7660, 74)\n",
      "2025-09-10 02:28:27,815 - File Process - INFO - new data save: dataset/data_20250724/20251t3_src_version/坦桑尼亚/export_paths/202502-EXP-RAW.csv\n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      " 67% 2/3 [01:04<00:33, 33.25s/it]"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "2025-09-10 02:28:27,816 - File Process - INFO - 乌克兰\n",
      "2025-09-10 02:28:27,816 - File Process - INFO - read data: dataset/data_20250724/20251t3_src/乌克兰/import_paths/UA_IMPORT_202501.csv\n",
      "202501\n",
      "2025-09-10 02:28:49,814 - File Process - INFO - data: (652325, 33)\n",
      "2025-09-10 02:28:49,816 - File Process - INFO - new data save: dataset/data_20250724/20251t3_src_version/乌克兰/import_paths/UA_IMPORT_202501.csv\n",
      "2025-09-10 02:28:49,818 - File Process - INFO - read data: dataset/data_20250724/20251t3_src/乌克兰/import_paths/UA_IMPORT_202503.csv\n",
      "202503\n",
      "2025-09-10 02:29:08,548 - File Process - INFO - data: (771565, 33)\n",
      "2025-09-10 02:29:08,549 - File Process - INFO - new data save: dataset/data_20250724/20251t3_src_version/乌克兰/import_paths/UA_IMPORT_202503.csv\n",
      "2025-09-10 02:29:08,552 - File Process - INFO - read data: dataset/data_20250724/20251t3_src/乌克兰/import_paths/UA_IMPORT_202502.csv\n",
      "202502\n",
      "2025-09-10 02:29:25,283 - File Process - INFO - data: (675826, 33)\n",
      "2025-09-10 02:29:25,285 - File Process - INFO - new data save: dataset/data_20250724/20251t3_src_version/乌克兰/import_paths/UA_IMPORT_202502.csv\n",
      "2025-09-10 02:29:25,287 - File Process - INFO - read data: dataset/data_20250724/20251t3_src/乌克兰/export_paths/UA_EXPORT_202503.csv\n",
      "202503\n",
      "2025-09-10 02:29:29,116 - File Process - INFO - data: (141342, 33)\n",
      "2025-09-10 02:29:29,118 - File Process - INFO - new data save: dataset/data_20250724/20251t3_src_version/乌克兰/export_paths/UA_EXPORT_202503.csv\n",
      "2025-09-10 02:29:29,120 - File Process - INFO - read data: dataset/data_20250724/20251t3_src/乌克兰/export_paths/UA_EXPORT_202502.csv\n",
      "202502\n",
      "2025-09-10 02:29:32,537 - File Process - INFO - data: (130121, 33)\n",
      "2025-09-10 02:29:32,539 - File Process - INFO - new data save: dataset/data_20250724/20251t3_src_version/乌克兰/export_paths/UA_EXPORT_202502.csv\n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "100% 3/3 [02:09<00:00, 43.15s/it]\n"
     ]
    }
   ],
   "source": [
    "get_data_month2(root_dir, save_dir, country_list, country36_cols_dict)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "6d25bd35-e78d-4277-a748-19468983a494",
   "metadata": {},
   "outputs": [],
   "source": []
  },
  {
   "cell_type": "markdown",
   "id": "878b0cc6-58ac-4ba3-a3b0-85949f5e4b6d",
   "metadata": {},
   "source": [
    "# 5 映射函数实现"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "4530484a-9a00-471f-a938-f4533cbdd9c6",
   "metadata": {},
   "source": [
    "## 5.2 出口、进口批量跑映射函数"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "3f11e091-6559-4b9f-a0f0-6c97d75c3f20",
   "metadata": {},
   "source": [
    "### 5.2.1 出口跑批"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 22,
   "id": "bfcbf49a-bf8e-4945-b23b-d0d5647b94c9",
   "metadata": {},
   "outputs": [],
   "source": [
    "# 获取文件目录，区分出口、进口数据\n",
    "root_dir = Path('dataset/data_20250724/20251t3_src_version')\n",
    "\n",
    "# csv结果文件保存路径\n",
    "csv_files_path = Path('dataset/data_20250724/20251t3_csv_save2')\n",
    "\n",
    "country_list = ['智利', '坦桑尼亚', '乌克兰']\n",
    "# country_list = ['智利']"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 23,
   "id": "6a365e16-9c5b-4938-9729-bf4a71422aa6",
   "metadata": {
    "scrolled": true
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "2025-09-10 02:29:32,634 - Bash Mapping - INFO - 坦桑尼亚\n",
      "2025-09-10 02:29:32,635 - Bash Mapping - INFO - dataset/data_20250724/20251t3_src_version/坦桑尼亚/export_paths/202501-EXP-RAW.csv\n",
      "2025-09-10 02:29:32,636 - Bash Mapping - INFO - 智利\n",
      "2025-09-10 02:29:32,637 - Bash Mapping - INFO - dataset/data_20250724/20251t3_src_version/智利/export_paths/202502-EXP.csv\n",
      "2025-09-10 02:29:32,637 - Bash Mapping - INFO - 乌克兰\n",
      "2025-09-10 02:29:32,638 - Bash Mapping - INFO - dataset/data_20250724/20251t3_src_version/乌克兰/export_paths/UA_EXPORT_202503.csv\n",
      "2025-09-10 02:29:32,757 - Data Mapping - INFO - imp_df columns: ['CUSTOMS_OFFICE_CODE' 'TANSAD_YY' 'TANSAD_DATE' 'TANSAD_SERIAL_NO'\n",
      " 'TANSAD_FULL_NO' 'REGIME_NAME' 'TANSAD_YYYYMM' 'RELEASE_YYYYMM'\n",
      " 'PROCESSING_STATUS_NAME' 'ORIGIN_COUNTRY' 'CONSIGNMENT_COUNTRY'\n",
      " 'TRADING_COUNTRY' 'EXPORT_COUNTRY' 'DEST_COUNTRY' 'CRN'\n",
      " 'MANIFEST_SUBMISSION_DATE' 'BL_NO' 'IS_CONTAINERIZED' 'NO_OF_CONTAINERS'\n",
      " 'VESSEL_NAME' 'TRANSPORT_MEANS_ID' 'TRANSPORT_NATIONALITY_CD'\n",
      " 'ARRIVAL_DATE' 'DISCHARGE_DATE' 'ENTRY_OFFICE_CODE'\n",
      " 'SELECTIVITY_RESULT_CD' 'CLEARANCE_PLAN_CD' 'TRANSPORT_MODE_CODE'\n",
      " 'TRANSPORT_MODE_NAME' 'IMPORTER_TAX_ID' 'IMPORTER_NAME' 'EXPORTER_TAX_ID'\n",
      " 'EXPORTER_NAME' 'DECLARANT_TAX_ID' 'DECLARANT_NAME'\n",
      " 'ITEM_INVOICE_CARGO_VALUE_FOREIGN_CURRENCY' 'FOREIGN_CURRENCY'\n",
      " 'EXCHANGE_RATE' 'PREFERENCE_CODE' 'ITEM_NO' 'EPC_CODE' 'APC_CODE'\n",
      " 'HS_CODE' 'HS_CODE_DESC' 'PRODUCT_DESC' 'USED_YN' 'QUANTITY'\n",
      " 'UNIT_OF_MEASURE' 'TOTAL_FREIGHT_AMOUNT' 'ITEM_G_WEIGHT' 'ITEM_N_WEIGHT'\n",
      " 'ITEM_CARGO_VALUE_STATISTICS' 'ITEM_CUSTOMS_CARGO_VALUE' 'IMPORT_DUTY'\n",
      " 'IMPORT_DUTY_EXEMPTED' 'EXPORT_DUTY' 'EXPORT_DUTY_EXEMPTED' 'EXCISE_DUTY'\n",
      " 'EXCISE_DUTY_EXEMPTED' 'VAT' 'VAT_EXEMPTED' 'FUEL_LEVY' 'ELV' 'EXP'\n",
      " 'VEH_REG' 'RAILWAY_DEV_LEVY' 'CUSTOMS_PROCESS_FEE' 'DECLARED_CUSTOMS_VAL'\n",
      " 'ASSESSED_CUSTOMS_VAL' 'ITEM_TAX_AMOUNT' 'dataid' 'src_country' 'iesign'\n",
      " 'version']\n",
      "2025-09-10 02:29:32,845 - Data Mapping - INFO - Export data mapping and cleaning takes 0.0 minutes\n",
      "2025-09-10 02:29:32,895 - Bash Mapping - INFO - target_tab shape: (9990, 47)\n",
      "2025-09-10 02:29:33,157 - File Process - INFO - CSV data saved to dataset/data_20250724/20251t3_csv_save2/坦桑尼亚/EXPORT/202501-EXP-RAW.csv\n",
      "2025-09-10 02:29:33,231 - Bash Mapping - INFO - dataset/data_20250724/20251t3_src_version/坦桑尼亚/export_paths/202503-EXP-RAW.csv\n",
      "2025-09-10 02:29:33,411 - Data Mapping - INFO - imp_df columns: ['CUSTOMS_OFFICE_CODE' 'TANSAD_YY' 'TANSAD_DATE' 'TANSAD_SERIAL_NO'\n",
      " 'TANSAD_FULL_NO' 'REGIME_NAME' 'TANSAD_YYYYMM' 'RELEASE_YYYYMM'\n",
      " 'PROCESSING_STATUS_NAME' 'ORIGIN_COUNTRY' 'CONSIGNMENT_COUNTRY'\n",
      " 'TRADING_COUNTRY' 'EXPORT_COUNTRY' 'DEST_COUNTRY' 'CRN'\n",
      " 'MANIFEST_SUBMISSION_DATE' 'BL_NO' 'IS_CONTAINERIZED' 'NO_OF_CONTAINERS'\n",
      " 'VESSEL_NAME' 'TRANSPORT_MEANS_ID' 'TRANSPORT_NATIONALITY_CD'\n",
      " 'ARRIVAL_DATE' 'DISCHARGE_DATE' 'ENTRY_OFFICE_CODE'\n",
      " 'SELECTIVITY_RESULT_CD' 'CLEARANCE_PLAN_CD' 'TRANSPORT_MODE_CODE'\n",
      " 'TRANSPORT_MODE_NAME' 'IMPORTER_TAX_ID' 'IMPORTER_NAME' 'EXPORTER_TAX_ID'\n",
      " 'EXPORTER_NAME' 'DECLARANT_TAX_ID' 'DECLARANT_NAME'\n",
      " 'ITEM_INVOICE_CARGO_VALUE_FOREIGN_CURRENCY' 'FOREIGN_CURRENCY'\n",
      " 'EXCHANGE_RATE' 'PREFERENCE_CODE' 'ITEM_NO' 'EPC_CODE' 'APC_CODE'\n",
      " 'HS_CODE' 'HS_CODE_DESC' 'PRODUCT_DESC' 'USED_YN' 'QUANTITY'\n",
      " 'UNIT_OF_MEASURE' 'TOTAL_FREIGHT_AMOUNT' 'ITEM_G_WEIGHT' 'ITEM_N_WEIGHT'\n",
      " 'ITEM_CARGO_VALUE_STATISTICS' 'ITEM_CUSTOMS_CARGO_VALUE' 'IMPORT_DUTY'\n",
      " 'IMPORT_DUTY_EXEMPTED' 'EXPORT_DUTY' 'EXPORT_DUTY_EXEMPTED' 'EXCISE_DUTY'\n",
      " 'EXCISE_DUTY_EXEMPTED' 'VAT' 'VAT_EXEMPTED' 'FUEL_LEVY' 'ELV' 'EXP'\n",
      " 'VEH_REG' 'RAILWAY_DEV_LEVY' 'CUSTOMS_PROCESS_FEE' 'DECLARED_CUSTOMS_VAL'\n",
      " 'ASSESSED_CUSTOMS_VAL' 'ITEM_TAX_AMOUNT' 'dataid' 'src_country' 'iesign'\n",
      " 'version']\n",
      "2025-09-10 02:29:33,496 - Data Mapping - INFO - imp_df columns: ['DAY' 'MONTH' 'YEAR' 'CONTROL_ID' 'CUSTOMS' 'EXPORTER_ID' 'VERIFIER'\n",
      " 'EXPORTER' 'HS_CODE' 'HS_CODE_DESC' 'PRODUCT' 'VARIETEY' 'BRAND' 'DESC'\n",
      " 'DEST_COUNTRY' 'TRANS_TYPE' 'TRANS_CORP' 'SHIP_NAME' 'TYPE_OF_LOAD'\n",
      " 'ORIGIN_PORT' 'DEST_PORT' 'G_WEIGHT' 'QUANTITY' 'UNIT_OF_QUANTITY' 'FOB'\n",
      " 'FREIGHT' 'INSURANCE' 'CIF' 'FOB_UNIT' 'TYPE_OF_PACKAGE'\n",
      " 'EXPORTER_REGION' 'PACKAGES' 'PACKAGES_DESC' 'TRANS_CORP_COUNTRY'\n",
      " 'SALE_CONDITION' 'ECONOMIC_ZONE' 'EXPORTER_ECONOMIC_KEY'\n",
      " 'TRANS_DOC_NUMBER' 'TRANS_DOC_DATE' 'VOYAGE_NUMBER' 'INCOTERMS' 'PAYMENT'\n",
      " 'dataid' 'src_country' 'iesign' 'version']\n",
      "2025-09-10 02:29:33,607 - Data Mapping - INFO - Export data mapping and cleaning takes 0.01 minutes\n",
      "2025-09-10 02:29:33,741 - Bash Mapping - INFO - target_tab shape: (7761, 47)\n",
      "2025-09-10 02:29:33,769 - Data Mapping - INFO - Export data mapping and cleaning takes 0.02 minutes\n",
      "2025-09-10 02:29:33,923 - Bash Mapping - INFO - target_tab shape: (98419, 47)\n",
      "2025-09-10 02:29:34,075 - File Process - INFO - CSV data saved to dataset/data_20250724/20251t3_csv_save2/坦桑尼亚/EXPORT/202503-EXP-RAW.csv\n",
      "2025-09-10 02:29:34,158 - Bash Mapping - INFO - dataset/data_20250724/20251t3_src_version/坦桑尼亚/export_paths/202503-TRANS-RAW.csv\n",
      "2025-09-10 02:29:34,838 - Data Mapping - INFO - imp_df columns: ['REG_DATE' 'CUSTOMS_OFFICE_CODE' 'DECLARATION_NUMBER' 'ITEM_NO' 'HS_CODE'\n",
      " 'PRODUCT_DESC' 'CUSTOMS_OFFICE_NAME_3WEI' 'SHIPPER_CODE' 'SHIPPER_NAME'\n",
      " 'GUARANTEE_CODE' 'GUARANTEE_NAME_ADDRESS' 'DEPARTURE_COUNTRY'\n",
      " 'TRADING_COUNTRY' 'DEST_COUNTRY' 'CONSIGNEE_NAME_ADDRESS'\n",
      " 'CONTRACT_HOLDER_NAME' 'PLACE_OF_RECEIPT' 'INCOTERMS'\n",
      " 'TRANS_TYPE_AT_BORDER' 'TRANS_TYPE_WITHIN_COUNTRY' 'UOM' 'QTY'\n",
      " 'N_WEIGHT_IN_KG' 'INVOICE_VALUE_USD' 'CUSTOMS_VALUE_USD'\n",
      " 'DECLARATION_TYPE_CODE' 'DIRECTION' 'PREV_CUSTOMS_MODE_CODE'\n",
      " 'CUSTOMS_MODE' 'dataid' 'src_country' 'iesign' 'version']\n",
      "2025-09-10 02:29:35,555 - Data Mapping - INFO - Export data mapping and cleaning takes 0.05 minutes\n",
      "2025-09-10 02:29:35,697 - Bash Mapping - INFO - target_tab shape: (141342, 47)\n",
      "2025-09-10 02:29:36,478 - Data Mapping - INFO - imp_df columns: ['CUSTOMS_OFFICE_CODE' 'TANSAD_YY' 'TANSAD_DATE' 'TANSAD_SERIAL_NO'\n",
      " 'TANSAD_FULL_NO' 'REGIME_NAME' 'TANSAD_YYYYMM' 'RELEASE_YYYYMM'\n",
      " 'PROCESSING_STATUS_NAME' 'ORIGIN_COUNTRY' 'CONSIGNMENT_COUNTRY'\n",
      " 'TRADING_COUNTRY' 'EXPORT_COUNTRY' 'DEST_COUNTRY' 'CRN'\n",
      " 'MANIFEST_SUBMISSION_DATE' 'BL_NO' 'IS_CONTAINERIZED' 'NO_OF_CONTAINERS'\n",
      " 'VESSEL_NAME' 'TRANSPORT_MEANS_ID' 'TRANSPORT_NATIONALITY_CD'\n",
      " 'ARRIVAL_DATE' 'DISCHARGE_DATE' 'ENTRY_OFFICE_CODE'\n",
      " 'SELECTIVITY_RESULT_CD' 'CLEARANCE_PLAN_CD' 'TRANSPORT_MODE_CODE'\n",
      " 'TRANSPORT_MODE_NAME' 'IMPORTER_TAX_ID' 'IMPORTER_NAME' 'EXPORTER_TAX_ID'\n",
      " 'EXPORTER_NAME' 'DECLARANT_TAX_ID' 'DECLARANT_NAME'\n",
      " 'ITEM_INVOICE_CARGO_VALUE_FOREIGN_CURRENCY' 'FOREIGN_CURRENCY'\n",
      " 'EXCHANGE_RATE' 'PREFERENCE_CODE' 'ITEM_NO' 'EPC_CODE' 'APC_CODE'\n",
      " 'HS_CODE' 'HS_CODE_DESC' 'PRODUCT_DESC' 'USED_YN' 'QUANTITY'\n",
      " 'UNIT_OF_MEASURE' 'TOTAL_FREIGHT_AMOUNT' 'ITEM_G_WEIGHT' 'ITEM_N_WEIGHT'\n",
      " 'ITEM_CARGO_VALUE_STATISTICS' 'ITEM_CUSTOMS_CARGO_VALUE' 'IMPORT_DUTY'\n",
      " 'IMPORT_DUTY_EXEMPTED' 'EXPORT_DUTY' 'EXPORT_DUTY_EXEMPTED' 'EXCISE_DUTY'\n",
      " 'EXCISE_DUTY_EXEMPTED' 'VAT' 'VAT_EXEMPTED' 'FUEL_LEVY' 'ELV' 'EXP'\n",
      " 'VEH_REG' 'RAILWAY_DEV_LEVY' 'CUSTOMS_PROCESS_FEE' 'DECLARED_CUSTOMS_VAL'\n",
      " 'ASSESSED_CUSTOMS_VAL' 'ITEM_TAX_AMOUNT' 'dataid' 'src_country' 'iesign'\n",
      " 'version']\n",
      "2025-09-10 02:29:36,819 - Data Mapping - INFO - Export data mapping and cleaning takes 0.04 minutes\n",
      "2025-09-10 02:29:36,953 - Bash Mapping - INFO - target_tab shape: (133800, 47)\n",
      "2025-09-10 02:29:37,518 - File Process - INFO - CSV data saved to dataset/data_20250724/20251t3_csv_save2/智利/EXPORT/202502-EXP.csv\n",
      "2025-09-10 02:29:37,650 - Bash Mapping - INFO - dataset/data_20250724/20251t3_src_version/智利/export_paths/202501-EXP.csv\n",
      "2025-09-10 02:29:38,470 - Data Mapping - INFO - imp_df columns: ['DAY' 'MONTH' 'YEAR' 'CONTROL_ID' 'CUSTOMS' 'EXPORTER_ID' 'VERIFIER'\n",
      " 'EXPORTER' 'HS_CODE' 'HS_CODE_DESC' 'PRODUCT' 'VARIETEY' 'BRAND' 'DESC'\n",
      " 'DEST_COUNTRY' 'TRANS_TYPE' 'TRANS_CORP' 'SHIP_NAME' 'TYPE_OF_LOAD'\n",
      " 'ORIGIN_PORT' 'DEST_PORT' 'G_WEIGHT' 'QUANTITY' 'UNIT_OF_QUANTITY' 'FOB'\n",
      " 'FREIGHT' 'INSURANCE' 'CIF' 'FOB_UNIT' 'TYPE_OF_PACKAGE'\n",
      " 'EXPORTER_REGION' 'PACKAGES' 'PACKAGES_DESC' 'TRANS_CORP_COUNTRY'\n",
      " 'SALE_CONDITION' 'ECONOMIC_ZONE' 'EXPORTER_ECONOMIC_KEY'\n",
      " 'TRANS_DOC_NUMBER' 'TRANS_DOC_DATE' 'VOYAGE_NUMBER' 'INCOTERMS' 'PAYMENT'\n",
      " 'dataid' 'src_country' 'iesign' 'version']\n",
      "2025-09-10 02:29:38,830 - Data Mapping - INFO - Export data mapping and cleaning takes 0.02 minutes\n",
      "2025-09-10 02:29:38,943 - Bash Mapping - INFO - target_tab shape: (127496, 47)\n",
      "2025-09-10 02:29:40,902 - File Process - INFO - CSV data saved to dataset/data_20250724/20251t3_csv_save2/坦桑尼亚/EXPORT/202503-TRANS-RAW.csv\n",
      "2025-09-10 02:29:40,987 - Bash Mapping - INFO - dataset/data_20250724/20251t3_src_version/坦桑尼亚/export_paths/202502-TRANS-RAW.csv\n",
      "2025-09-10 02:29:41,508 - File Process - INFO - CSV data saved to dataset/data_20250724/20251t3_csv_save2/智利/EXPORT/202501-EXP.csv\n",
      "2025-09-10 02:29:41,593 - Bash Mapping - INFO - dataset/data_20250724/20251t3_src_version/智利/export_paths/202503-EXP.csv\n",
      "2025-09-10 02:29:42,401 - Data Mapping - INFO - imp_df columns: ['DAY' 'MONTH' 'YEAR' 'CONTROL_ID' 'CUSTOMS' 'EXPORTER_ID' 'VERIFIER'\n",
      " 'EXPORTER' 'HS_CODE' 'HS_CODE_DESC' 'PRODUCT' 'VARIETEY' 'BRAND' 'DESC'\n",
      " 'DEST_COUNTRY' 'TRANS_TYPE' 'TRANS_CORP' 'SHIP_NAME' 'TYPE_OF_LOAD'\n",
      " 'ORIGIN_PORT' 'DEST_PORT' 'G_WEIGHT' 'QUANTITY' 'UNIT_OF_QUANTITY' 'FOB'\n",
      " 'FREIGHT' 'INSURANCE' 'CIF' 'FOB_UNIT' 'TYPE_OF_PACKAGE'\n",
      " 'EXPORTER_REGION' 'PACKAGES' 'PACKAGES_DESC' 'TRANS_CORP_COUNTRY'\n",
      " 'SALE_CONDITION' 'ECONOMIC_ZONE' 'EXPORTER_ECONOMIC_KEY'\n",
      " 'TRANS_DOC_NUMBER' 'TRANS_DOC_DATE' 'VOYAGE_NUMBER' 'INCOTERMS' 'PAYMENT'\n",
      " 'dataid' 'src_country' 'iesign' 'version']\n",
      "2025-09-10 02:29:42,635 - Data Mapping - INFO - imp_df columns: ['CUSTOMS_OFFICE_CODE' 'TANSAD_YY' 'TANSAD_DATE' 'TANSAD_SERIAL_NO'\n",
      " 'TANSAD_FULL_NO' 'REGIME_NAME' 'TANSAD_YYYYMM' 'RELEASE_YYYYMM'\n",
      " 'PROCESSING_STATUS_NAME' 'ORIGIN_COUNTRY' 'CONSIGNMENT_COUNTRY'\n",
      " 'TRADING_COUNTRY' 'EXPORT_COUNTRY' 'DEST_COUNTRY' 'CRN'\n",
      " 'MANIFEST_SUBMISSION_DATE' 'BL_NO' 'IS_CONTAINERIZED' 'NO_OF_CONTAINERS'\n",
      " 'VESSEL_NAME' 'TRANSPORT_MEANS_ID' 'TRANSPORT_NATIONALITY_CD'\n",
      " 'ARRIVAL_DATE' 'DISCHARGE_DATE' 'ENTRY_OFFICE_CODE'\n",
      " 'SELECTIVITY_RESULT_CD' 'CLEARANCE_PLAN_CD' 'TRANSPORT_MODE_CODE'\n",
      " 'TRANSPORT_MODE_NAME' 'IMPORTER_TAX_ID' 'IMPORTER_NAME' 'EXPORTER_TAX_ID'\n",
      " 'EXPORTER_NAME' 'DECLARANT_TAX_ID' 'DECLARANT_NAME'\n",
      " 'ITEM_INVOICE_CARGO_VALUE_FOREIGN_CURRENCY' 'FOREIGN_CURRENCY'\n",
      " 'EXCHANGE_RATE' 'PREFERENCE_CODE' 'ITEM_NO' 'EPC_CODE' 'APC_CODE'\n",
      " 'HS_CODE' 'HS_CODE_DESC' 'PRODUCT_DESC' 'USED_YN' 'QUANTITY'\n",
      " 'UNIT_OF_MEASURE' 'TOTAL_FREIGHT_AMOUNT' 'ITEM_G_WEIGHT' 'ITEM_N_WEIGHT'\n",
      " 'ITEM_CARGO_VALUE_STATISTICS' 'ITEM_CUSTOMS_CARGO_VALUE' 'IMPORT_DUTY'\n",
      " 'IMPORT_DUTY_EXEMPTED' 'EXPORT_DUTY' 'EXPORT_DUTY_EXEMPTED' 'EXCISE_DUTY'\n",
      " 'EXCISE_DUTY_EXEMPTED' 'VAT' 'VAT_EXEMPTED' 'FUEL_LEVY' 'ELV' 'EXP'\n",
      " 'VEH_REG' 'RAILWAY_DEV_LEVY' 'CUSTOMS_PROCESS_FEE' 'DECLARED_CUSTOMS_VAL'\n",
      " 'ASSESSED_CUSTOMS_VAL' 'ITEM_TAX_AMOUNT' 'dataid' 'src_country' 'iesign'\n",
      " 'version']\n",
      "2025-09-10 02:29:42,953 - Data Mapping - INFO - Export data mapping and cleaning takes 0.02 minutes\n",
      "2025-09-10 02:29:43,139 - Bash Mapping - INFO - target_tab shape: (100956, 47)\n",
      "2025-09-10 02:29:43,215 - Data Mapping - INFO - Export data mapping and cleaning takes 0.04 minutes\n",
      "2025-09-10 02:29:43,335 - Bash Mapping - INFO - target_tab shape: (129629, 47)\n",
      "2025-09-10 02:29:44,166 - File Process - INFO - CSV data saved to dataset/data_20250724/20251t3_csv_save2/乌克兰/EXPORT/UA_EXPORT_202503.csv\n",
      "2025-09-10 02:29:44,286 - Bash Mapping - INFO - dataset/data_20250724/20251t3_src_version/乌克兰/export_paths/UA_EXPORT_202502.csv\n",
      "2025-09-10 02:29:45,519 - Data Mapping - INFO - imp_df columns: ['REG_DATE' 'CUSTOMS_OFFICE_CODE' 'DECLARATION_NUMBER' 'ITEM_NO' 'HS_CODE'\n",
      " 'PRODUCT_DESC' 'CUSTOMS_OFFICE_NAME_3WEI' 'SHIPPER_CODE' 'SHIPPER_NAME'\n",
      " 'GUARANTEE_CODE' 'GUARANTEE_NAME_ADDRESS' 'DEPARTURE_COUNTRY'\n",
      " 'TRADING_COUNTRY' 'DEST_COUNTRY' 'CONSIGNEE_NAME_ADDRESS'\n",
      " 'CONTRACT_HOLDER_NAME' 'PLACE_OF_RECEIPT' 'INCOTERMS'\n",
      " 'TRANS_TYPE_AT_BORDER' 'TRANS_TYPE_WITHIN_COUNTRY' 'UOM' 'QTY'\n",
      " 'N_WEIGHT_IN_KG' 'INVOICE_VALUE_USD' 'CUSTOMS_VALUE_USD'\n",
      " 'DECLARATION_TYPE_CODE' 'DIRECTION' 'PREV_CUSTOMS_MODE_CODE'\n",
      " 'CUSTOMS_MODE' 'dataid' 'src_country' 'iesign' 'version']\n",
      "2025-09-10 02:29:45,553 - File Process - INFO - CSV data saved to dataset/data_20250724/20251t3_csv_save2/智利/EXPORT/202503-EXP.csv\n",
      "2025-09-10 02:29:45,667 - Bash Mapping - INFO - all data takes 0.22 minutes\n",
      "2025-09-10 02:29:45,711 - Data Goverance - INFO - 1\n",
      "2025-09-10 02:29:46,203 - Data Mapping - INFO - Export data mapping and cleaning takes 0.03 minutes\n",
      "2025-09-10 02:29:46,330 - Bash Mapping - INFO - target_tab shape: (130121, 47)\n",
      "2025-09-10 02:29:47,074 - File Process - INFO - CSV data saved to dataset/data_20250724/20251t3_csv_save2/坦桑尼亚/EXPORT/202502-TRANS-RAW.csv\n",
      "2025-09-10 02:29:47,184 - Bash Mapping - INFO - dataset/data_20250724/20251t3_src_version/坦桑尼亚/export_paths/202501-TRANS-RAW.csv\n",
      "2025-09-10 02:29:48,351 - Data Mapping - INFO - imp_df columns: ['CUSTOMS_OFFICE_CODE' 'TANSAD_YY' 'TANSAD_DATE' 'TANSAD_SERIAL_NO'\n",
      " 'TANSAD_FULL_NO' 'REGIME_NAME' 'TANSAD_YYYYMM' 'RELEASE_YYYYMM'\n",
      " 'PROCESSING_STATUS_NAME' 'ORIGIN_COUNTRY' 'CONSIGNMENT_COUNTRY'\n",
      " 'TRADING_COUNTRY' 'EXPORT_COUNTRY' 'DEST_COUNTRY' 'CRN'\n",
      " 'MANIFEST_SUBMISSION_DATE' 'BL_NO' 'IS_CONTAINERIZED' 'NO_OF_CONTAINERS'\n",
      " 'VESSEL_NAME' 'TRANSPORT_MEANS_ID' 'TRANSPORT_NATIONALITY_CD'\n",
      " 'ARRIVAL_DATE' 'DISCHARGE_DATE' 'ENTRY_OFFICE_CODE'\n",
      " 'SELECTIVITY_RESULT_CD' 'CLEARANCE_PLAN_CD' 'TRANSPORT_MODE_CODE'\n",
      " 'TRANSPORT_MODE_NAME' 'IMPORTER_TAX_ID' 'IMPORTER_NAME' 'EXPORTER_TAX_ID'\n",
      " 'EXPORTER_NAME' 'DECLARANT_TAX_ID' 'DECLARANT_NAME'\n",
      " 'ITEM_INVOICE_CARGO_VALUE_FOREIGN_CURRENCY' 'FOREIGN_CURRENCY'\n",
      " 'EXCHANGE_RATE' 'PREFERENCE_CODE' 'ITEM_NO' 'EPC_CODE' 'APC_CODE'\n",
      " 'HS_CODE' 'HS_CODE_DESC' 'PRODUCT_DESC' 'USED_YN' 'QUANTITY'\n",
      " 'UNIT_OF_MEASURE' 'TOTAL_FREIGHT_AMOUNT' 'ITEM_G_WEIGHT' 'ITEM_N_WEIGHT'\n",
      " 'ITEM_CARGO_VALUE_STATISTICS' 'ITEM_CUSTOMS_CARGO_VALUE' 'IMPORT_DUTY'\n",
      " 'IMPORT_DUTY_EXEMPTED' 'EXPORT_DUTY' 'EXPORT_DUTY_EXEMPTED' 'EXCISE_DUTY'\n",
      " 'EXCISE_DUTY_EXEMPTED' 'VAT' 'VAT_EXEMPTED' 'FUEL_LEVY' 'ELV' 'EXP'\n",
      " 'VEH_REG' 'RAILWAY_DEV_LEVY' 'CUSTOMS_PROCESS_FEE' 'DECLARED_CUSTOMS_VAL'\n",
      " 'ASSESSED_CUSTOMS_VAL' 'ITEM_TAX_AMOUNT' 'dataid' 'src_country' 'iesign'\n",
      " 'version']\n",
      "2025-09-10 02:29:48,681 - Data Mapping - INFO - Export data mapping and cleaning takes 0.02 minutes\n",
      "2025-09-10 02:29:48,809 - Bash Mapping - INFO - target_tab shape: (152034, 47)\n",
      "2025-09-10 02:29:50,293 - File Process - INFO - CSV data saved to dataset/data_20250724/20251t3_csv_save2/乌克兰/EXPORT/UA_EXPORT_202502.csv\n",
      "2025-09-10 02:29:50,406 - Bash Mapping - INFO - dataset/data_20250724/20251t3_src_version/乌克兰/import_paths/UA_EXPORT_202503.csv\n",
      "2025-09-10 02:29:50,981 - File Process - INFO - CSV data saved to dataset/data_20250724/20251t3_csv_save2/坦桑尼亚/EXPORT/202501-TRANS-RAW.csv\n",
      "2025-09-10 02:29:51,101 - Bash Mapping - INFO - dataset/data_20250724/20251t3_src_version/坦桑尼亚/export_paths/202502-EXP-RAW.csv\n",
      "2025-09-10 02:29:51,156 - Data Mapping - INFO - imp_df columns: ['CUSTOMS_OFFICE_CODE' 'TANSAD_YY' 'TANSAD_DATE' 'TANSAD_SERIAL_NO'\n",
      " 'TANSAD_FULL_NO' 'REGIME_NAME' 'TANSAD_YYYYMM' 'RELEASE_YYYYMM'\n",
      " 'PROCESSING_STATUS_NAME' 'ORIGIN_COUNTRY' 'CONSIGNMENT_COUNTRY'\n",
      " 'TRADING_COUNTRY' 'EXPORT_COUNTRY' 'DEST_COUNTRY' 'CRN'\n",
      " 'MANIFEST_SUBMISSION_DATE' 'BL_NO' 'IS_CONTAINERIZED' 'NO_OF_CONTAINERS'\n",
      " 'VESSEL_NAME' 'TRANSPORT_MEANS_ID' 'TRANSPORT_NATIONALITY_CD'\n",
      " 'ARRIVAL_DATE' 'DISCHARGE_DATE' 'ENTRY_OFFICE_CODE'\n",
      " 'SELECTIVITY_RESULT_CD' 'CLEARANCE_PLAN_CD' 'TRANSPORT_MODE_CODE'\n",
      " 'TRANSPORT_MODE_NAME' 'IMPORTER_TAX_ID' 'IMPORTER_NAME' 'EXPORTER_TAX_ID'\n",
      " 'EXPORTER_NAME' 'DECLARANT_TAX_ID' 'DECLARANT_NAME'\n",
      " 'ITEM_INVOICE_CARGO_VALUE_FOREIGN_CURRENCY' 'FOREIGN_CURRENCY'\n",
      " 'EXCHANGE_RATE' 'PREFERENCE_CODE' 'ITEM_NO' 'EPC_CODE' 'APC_CODE'\n",
      " 'HS_CODE' 'HS_CODE_DESC' 'PRODUCT_DESC' 'USED_YN' 'QUANTITY'\n",
      " 'UNIT_OF_MEASURE' 'TOTAL_FREIGHT_AMOUNT' 'ITEM_G_WEIGHT' 'ITEM_N_WEIGHT'\n",
      " 'ITEM_CARGO_VALUE_STATISTICS' 'ITEM_CUSTOMS_CARGO_VALUE' 'IMPORT_DUTY'\n",
      " 'IMPORT_DUTY_EXEMPTED' 'EXPORT_DUTY' 'EXPORT_DUTY_EXEMPTED' 'EXCISE_DUTY'\n",
      " 'EXCISE_DUTY_EXEMPTED' 'VAT' 'VAT_EXEMPTED' 'FUEL_LEVY' 'ELV' 'EXP'\n",
      " 'VEH_REG' 'RAILWAY_DEV_LEVY' 'CUSTOMS_PROCESS_FEE' 'DECLARED_CUSTOMS_VAL'\n",
      " 'ASSESSED_CUSTOMS_VAL' 'ITEM_TAX_AMOUNT' 'dataid' 'src_country' 'iesign'\n",
      " 'version']\n",
      "2025-09-10 02:29:51,178 - Data Mapping - INFO - Export data mapping and cleaning takes 0.0 minutes\n",
      "2025-09-10 02:29:51,183 - Bash Mapping - INFO - target_tab shape: (7660, 47)\n",
      "2025-09-10 02:29:51,305 - File Process - INFO - CSV data saved to dataset/data_20250724/20251t3_csv_save2/坦桑尼亚/EXPORT/202502-EXP-RAW.csv\n",
      "2025-09-10 02:29:51,373 - Bash Mapping - INFO - all data takes 0.31 minutes\n",
      "2025-09-10 02:29:51,374 - Data Goverance - INFO - 1\n",
      "2025-09-10 02:29:51,965 - Data Mapping - INFO - imp_df columns: ['REG_DATE' 'CUSTOMS_OFFICE_CODE' 'DECLARATION_NUMBER' 'ITEM_NO' 'HS_CODE'\n",
      " 'PRODUCT_DESC' 'CUSTOMS_OFFICE_NAME_3WEI' 'SHIPPER_CODE' 'SHIPPER_NAME'\n",
      " 'GUARANTEE_CODE' 'GUARANTEE_NAME_ADDRESS' 'DEPARTURE_COUNTRY'\n",
      " 'TRADING_COUNTRY' 'DEST_COUNTRY' 'CONSIGNEE_NAME_ADDRESS'\n",
      " 'CONTRACT_HOLDER_NAME' 'PLACE_OF_RECEIPT' 'INCOTERMS'\n",
      " 'TRANS_TYPE_AT_BORDER' 'TRANS_TYPE_WITHIN_COUNTRY' 'UOM' 'QTY'\n",
      " 'N_WEIGHT_IN_KG' 'INVOICE_VALUE_USD' 'CUSTOMS_VALUE_USD'\n",
      " 'DECLARATION_TYPE_CODE' 'DIRECTION' 'PREV_CUSTOMS_MODE_CODE'\n",
      " 'CUSTOMS_MODE' 'dataid' 'src_country' 'iesign' 'version']\n",
      "2025-09-10 02:29:52,555 - Data Mapping - INFO - Export data mapping and cleaning takes 0.04 minutes\n",
      "2025-09-10 02:29:52,696 - Bash Mapping - INFO - target_tab shape: (141342, 47)\n",
      "2025-09-10 02:29:54,497 - File Process - INFO - CSV data saved to dataset/data_20250724/20251t3_csv_save2/乌克兰/EXPORT/UA_EXPORT_202503.csv\n",
      "2025-09-10 02:29:54,657 - Bash Mapping - INFO - dataset/data_20250724/20251t3_src_version/乌克兰/import_paths/UA_EXPORT_202502.csv\n",
      "2025-09-10 02:29:55,654 - Data Mapping - INFO - imp_df columns: ['REG_DATE' 'CUSTOMS_OFFICE_CODE' 'DECLARATION_NUMBER' 'ITEM_NO' 'HS_CODE'\n",
      " 'PRODUCT_DESC' 'CUSTOMS_OFFICE_NAME_3WEI' 'SHIPPER_CODE' 'SHIPPER_NAME'\n",
      " 'GUARANTEE_CODE' 'GUARANTEE_NAME_ADDRESS' 'DEPARTURE_COUNTRY'\n",
      " 'TRADING_COUNTRY' 'DEST_COUNTRY' 'CONSIGNEE_NAME_ADDRESS'\n",
      " 'CONTRACT_HOLDER_NAME' 'PLACE_OF_RECEIPT' 'INCOTERMS'\n",
      " 'TRANS_TYPE_AT_BORDER' 'TRANS_TYPE_WITHIN_COUNTRY' 'UOM' 'QTY'\n",
      " 'N_WEIGHT_IN_KG' 'INVOICE_VALUE_USD' 'CUSTOMS_VALUE_USD'\n",
      " 'DECLARATION_TYPE_CODE' 'DIRECTION' 'PREV_CUSTOMS_MODE_CODE'\n",
      " 'CUSTOMS_MODE' 'dataid' 'src_country' 'iesign' 'version']\n",
      "2025-09-10 02:29:56,213 - Data Mapping - INFO - Export data mapping and cleaning takes 0.03 minutes\n",
      "2025-09-10 02:29:56,335 - Bash Mapping - INFO - target_tab shape: (130121, 47)\n",
      "2025-09-10 02:29:58,024 - File Process - INFO - CSV data saved to dataset/data_20250724/20251t3_csv_save2/乌克兰/EXPORT/UA_EXPORT_202502.csv\n",
      "2025-09-10 02:29:58,174 - Bash Mapping - INFO - all data takes 0.43 minutes\n",
      "2025-09-10 02:29:58,174 - Data Goverance - INFO - 1\n",
      "2025-09-10 02:29:58,175 - Data Goverance - INFO - country_normal_mapping takes 0.42606507539749144 minutes\n"
     ]
    }
   ],
   "source": [
    "# Export batch run: map every country's export files concurrently, one task per country.\n",
    "start = time.time()\n",
    "\n",
    "# Each task gets a single-country list plus the shared input root, output root and column dictionary.\n",
    "params_list = [\n",
    "    ([ctry], root_dir, csv_files_path, all_country_cols_dict)\n",
    "    for ctry in country_list\n",
    "]\n",
    "with ThreadPoolExecutor(max_workers=10) as executor:\n",
    "    # Submit every task up front; each parameter tuple is unpacked into the worker's positional arguments.\n",
    "    futures = [executor.submit(bash_mapping_export, *params) for params in params_list]\n",
    "    # Log each task's return value in completion order (not submission order).\n",
    "    for future in as_completed(futures):\n",
    "        logger.info(future.result())\n",
    "\n",
    "end = time.time()\n",
    "logger.info(f'country_normal_mapping takes {(end-start)/60} minutes')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "c960344b-a014-448c-9d24-221c07d668eb",
   "metadata": {},
   "outputs": [],
   "source": []
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "93098180-1531-43b7-87b9-3ce668e685ab",
   "metadata": {},
   "outputs": [],
   "source": []
  },
  {
   "cell_type": "markdown",
   "id": "57029054-4736-4783-a7d6-3418e0346157",
   "metadata": {
    "scrolled": true
   },
   "source": [
    "### 5.2.2 进口跑批"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 24,
   "id": "48dec95f-ee93-4b11-937c-5d7ec4fb16b1",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Countries processed by the batch run.\n",
    "# For a quick single-country run use: country_list = ['智利']\n",
    "country_list = ['智利', '坦桑尼亚', '乌克兰']\n",
    "\n",
    "# Input root with the per-country source files; export and import data live in separate sub-folders.\n",
    "root_dir = Path('dataset/data_20250724/20251t3_src_version')\n",
    "\n",
    "# Output root for the mapped CSV result files.\n",
    "# Alternative output root kept for reference: Path('dataset/data_20250724/20251t3_csv_save')\n",
    "csv_files_path = Path('dataset/data_20250724/20251t3_csv_save2')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 25,
   "id": "e2fe8ada-a656-4396-8ffd-c6eafb1050e4",
   "metadata": {
    "scrolled": true
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "2025-09-10 02:29:58,214 - Bash Mapping - INFO - 智利\n",
      "2025-09-10 02:29:58,214 - Bash Mapping - INFO - dataset/data_20250724/20251t3_src_version/智利/import_paths/202501-IMP.csv\n",
      "2025-09-10 02:29:58,216 - Bash Mapping - INFO - 乌克兰\n",
      "2025-09-10 02:29:58,216 - Bash Mapping - INFO - dataset/data_20250724/20251t3_src_version/乌克兰/import_paths/UA_IMPORT_202501.csv\n",
      "2025-09-10 02:29:58,216 - Bash Mapping - INFO - 坦桑尼亚\n",
      "2025-09-10 02:29:58,217 - Bash Mapping - INFO - dataset/data_20250724/20251t3_src_version/坦桑尼亚/import_paths/202501-IMP-RAW.csv\n",
      "2025-09-10 02:30:02,553 - Data Mapping - INFO - imp_df columns: ['DAY' 'MONTH' 'YEAR' 'CUSTOMS' 'CONTROL_ID' 'IMPORTER_ID' 'VERIFIER'\n",
      " 'IMPORTER' 'HS_CODE' 'HS_CODE_DESC' 'PRODUCT' 'BRAND' 'VARIETEY' 'DESC'\n",
      " 'ORIGIN_COUNTRY' 'SALES_COUNTRY' 'TRANS_TYPE' 'PAYMENT' 'ORIGIN_PORT'\n",
      " 'DEST_PORT' 'TRANS_CORP' 'TYPE_OF_LOAD' 'TYPE_OF_PACKAGE' 'G_WEIGHT'\n",
      " 'INCOTERMS' 'TAX' 'QUANTITY' 'UNIT_OF_QUANTITY' 'FOB' 'FREIGHT'\n",
      " 'INSURANCE' 'CIF' 'CIF_UNIT' 'FOB_UNIT' 'TRANS_CORP_COUNTRY' 'US_TAX'\n",
      " 'PACKAGES' 'ECONOMIC_ZONE' 'IMPORTER_ECONOMIC_KEY' 'MANIFEST_NUMBER'\n",
      " 'MANIFEST_DATE' 'TRANS_DOC_NUMBER' 'TRANS_DOC_DATE' 'dataid'\n",
      " 'src_country' 'iesign' 'version']\n",
      "2025-09-10 02:30:03,659 - Data Mapping - INFO - Import data mapping and cleaning takes 0.09 minutes\n",
      "2025-09-10 02:30:04,028 - Bash Mapping - INFO - target_tab shape: (383012, 47)\n",
      "2025-09-10 02:30:05,668 - Data Mapping - INFO - imp_df columns: ['CUSTOMS_OFFICE_CODE' 'TANSAD_YY' 'TANSAD_DATE' 'TANSAD_SERIAL_NO'\n",
      " 'TANSAD_FULL_NO' 'REGIME_NAME' 'TANSAD_YYYYMM' 'RELEASE_YYYYMM'\n",
      " 'PROCESSING_STATUS_NAME' 'ORIGIN_COUNTRY' 'CONSIGNMENT_COUNTRY'\n",
      " 'TRADING_COUNTRY' 'EXPORT_COUNTRY' 'DEST_COUNTRY' 'CRN'\n",
      " 'MANIFEST_SUBMISSION_DATE' 'BL_NO' 'IS_CONTAINERIZED' 'NO_OF_CONTAINERS'\n",
      " 'VESSEL_NAME' 'TRANSPORT_MEANS_ID' 'TRANSPORT_NATIONALITY_CD'\n",
      " 'ARRIVAL_DATE' 'DISCHARGE_DATE' 'ENTRY_OFFICE_CODE'\n",
      " 'SELECTIVITY_RESULT_CD' 'CLEARANCE_PLAN_CD' 'TRANSPORT_MODE_CODE'\n",
      " 'TRANSPORT_MODE_NAME' 'IMPORTER_TAX_ID' 'IMPORTER_NAME' 'EXPORTER_TAX_ID'\n",
      " 'EXPORTER_NAME' 'DECLARANT_TAX_ID' 'DECLARANT_NAME'\n",
      " 'ITEM_INVOICE_CARGO_VALUE_FOREIGN_CURRENCY' 'FOREIGN_CURRENCY'\n",
      " 'EXCHANGE_RATE' 'PREFERENCE_CODE' 'ITEM_NO' 'EPC_CODE' 'APC_CODE'\n",
      " 'HS_CODE' 'HS_CODE_DESC' 'PRODUCT_DESC' 'USED_YN' 'QUANTITY'\n",
      " 'UNIT_OF_MEASURE' 'TOTAL_FREIGHT_AMOUNT' 'ITEM_G_WEIGHT' 'ITEM_N_WEIGHT'\n",
      " 'ITEM_CARGO_VALUE_STATISTICS' 'ITEM_CUSTOMS_CARGO_VALUE' 'IMPORT_DUTY'\n",
      " 'IMPORT_DUTY_EXEMPTED' 'EXPORT_DUTY' 'EXPORT_DUTY_EXEMPTED' 'EXCISE_DUTY'\n",
      " 'EXCISE_DUTY_EXEMPTED' 'VAT' 'VAT_EXEMPTED' 'FUEL_LEVY' 'ELV' 'EXP'\n",
      " 'VEH_REG' 'RAILWAY_DEV_LEVY' 'CUSTOMS_PROCESS_FEE' 'DECLARED_CUSTOMS_VAL'\n",
      " 'ASSESSED_CUSTOMS_VAL' 'ITEM_TAX_AMOUNT' 'dataid' 'src_country' 'iesign'\n",
      " 'version']\n",
      "2025-09-10 02:30:06,791 - Data Mapping - INFO - Import data mapping and cleaning takes 0.14 minutes\n",
      "2025-09-10 02:30:07,242 - Bash Mapping - INFO - target_tab shape: (432559, 47)\n",
      "2025-09-10 02:30:11,880 - Data Mapping - INFO - imp_df columns: ['REG_DATE' 'CUSTOMS_OFFICE_CODE' 'DECLARATION_NUMBER' 'ITEM_NO' 'HS_CODE'\n",
      " 'PRODUCT_DESC' 'CUSTOMS_OFFICE_NAME_3WEI' 'CONSIGNEE_CODE'\n",
      " 'CONSIGNEE_NAME_ADDRESS' 'GUARANTEE_CODE' 'GUARANTEE_NAME_ADDRESS'\n",
      " 'DEPARTURE_COUNTRY' 'TRADING_COUNTRY' 'ORIGIN_COUNTRY' 'SHIPPER_NAME'\n",
      " 'CONTRACT_HOLDER_NAME' 'PLACE_OF_DELIVERY' 'INCOTERMS'\n",
      " 'TRANS_TYPE_AT_BORDER' 'TRANS_TYPE_WITHIN_COUNTRY' 'UNIT_OF_QUANTITY'\n",
      " 'QUANTITY' 'N_WEIGHT_IN_KG' 'INVOICE_VALUE_USD' 'CUSTOMS_VALUE_USD'\n",
      " 'DECLARATION_TYPE_CODE' 'DIRECTION' 'PREV_CUSTOMS_MODE_CODE'\n",
      " 'CUSTOMS_MODE' 'dataid' 'src_country' 'iesign' 'version']\n",
      "2025-09-10 02:30:14,655 - Data Mapping - INFO - Import data mapping and cleaning takes 0.27 minutes\n",
      "2025-09-10 02:30:15,469 - Bash Mapping - INFO - target_tab shape: (652325, 47)\n",
      "2025-09-10 02:30:19,497 - File Process - INFO - CSV data saved to dataset/data_20250724/20251t3_csv_save2/智利/IMPORT/202501-IMP.csv\n",
      "2025-09-10 02:30:19,742 - Bash Mapping - INFO - dataset/data_20250724/20251t3_src_version/智利/import_paths/202502-IMP.csv\n",
      "2025-09-10 02:30:20,487 - File Process - INFO - CSV data saved to dataset/data_20250724/20251t3_csv_save2/坦桑尼亚/IMPORT/202501-IMP-RAW.csv\n",
      "2025-09-10 02:30:20,594 - Bash Mapping - INFO - dataset/data_20250724/20251t3_src_version/坦桑尼亚/import_paths/202503-IMP-RAW.csv\n",
      "2025-09-10 02:30:22,290 - Data Mapping - INFO - imp_df columns: ['DAY' 'MONTH' 'YEAR' 'CUSTOMS' 'CONTROL_ID' 'IMPORTER_ID' 'VERIFIER'\n",
      " 'IMPORTER' 'HS_CODE' 'HS_CODE_DESC' 'PRODUCT' 'BRAND' 'VARIETEY' 'DESC'\n",
      " 'ORIGIN_COUNTRY' 'SALES_COUNTRY' 'TRANS_TYPE' 'PAYMENT' 'ORIGIN_PORT'\n",
      " 'DEST_PORT' 'TRANS_CORP' 'TYPE_OF_LOAD' 'TYPE_OF_PACKAGE' 'G_WEIGHT'\n",
      " 'INCOTERMS' 'TAX' 'QUANTITY' 'UNIT_OF_QUANTITY' 'FOB' 'FREIGHT'\n",
      " 'INSURANCE' 'CIF' 'CIF_UNIT' 'FOB_UNIT' 'TRANS_CORP_COUNTRY' 'US_TAX'\n",
      " 'PACKAGES' 'ECONOMIC_ZONE' 'IMPORTER_ECONOMIC_KEY' 'MANIFEST_NUMBER'\n",
      " 'MANIFEST_DATE' 'TRANS_DOC_NUMBER' 'TRANS_DOC_DATE' 'dataid'\n",
      " 'src_country' 'iesign' 'version']\n",
      "2025-09-10 02:30:23,222 - Data Mapping - INFO - Import data mapping and cleaning takes 0.06 minutes\n",
      "2025-09-10 02:30:23,551 - Bash Mapping - INFO - target_tab shape: (354974, 47)\n",
      "2025-09-10 02:30:25,675 - Data Mapping - INFO - imp_df columns: ['CUSTOMS_OFFICE_CODE' 'TANSAD_YY' 'TANSAD_DATE' 'TANSAD_SERIAL_NO'\n",
      " 'TANSAD_FULL_NO' 'REGIME_NAME' 'TANSAD_YYYYMM' 'RELEASE_YYYYMM'\n",
      " 'PROCESSING_STATUS_NAME' 'ORIGIN_COUNTRY' 'CONSIGNMENT_COUNTRY'\n",
      " 'TRADING_COUNTRY' 'EXPORT_COUNTRY' 'DEST_COUNTRY' 'CRN'\n",
      " 'MANIFEST_SUBMISSION_DATE' 'BL_NO' 'IS_CONTAINERIZED' 'NO_OF_CONTAINERS'\n",
      " 'VESSEL_NAME' 'TRANSPORT_MEANS_ID' 'TRANSPORT_NATIONALITY_CD'\n",
      " 'ARRIVAL_DATE' 'DISCHARGE_DATE' 'ENTRY_OFFICE_CODE'\n",
      " 'SELECTIVITY_RESULT_CD' 'CLEARANCE_PLAN_CD' 'TRANSPORT_MODE_CODE'\n",
      " 'TRANSPORT_MODE_NAME' 'IMPORTER_TAX_ID' 'IMPORTER_NAME' 'EXPORTER_TAX_ID'\n",
      " 'EXPORTER_NAME' 'DECLARANT_TAX_ID' 'DECLARANT_NAME'\n",
      " 'ITEM_INVOICE_CARGO_VALUE_FOREIGN_CURRENCY' 'FOREIGN_CURRENCY'\n",
      " 'EXCHANGE_RATE' 'PREFERENCE_CODE' 'ITEM_NO' 'EPC_CODE' 'APC_CODE'\n",
      " 'HS_CODE' 'HS_CODE_DESC' 'PRODUCT_DESC' 'USED_YN' 'QUANTITY'\n",
      " 'UNIT_OF_MEASURE' 'TOTAL_FREIGHT_AMOUNT' 'ITEM_G_WEIGHT' 'ITEM_N_WEIGHT'\n",
      " 'ITEM_CARGO_VALUE_STATISTICS' 'ITEM_CUSTOMS_CARGO_VALUE' 'IMPORT_DUTY'\n",
      " 'IMPORT_DUTY_EXEMPTED' 'EXPORT_DUTY' 'EXPORT_DUTY_EXEMPTED' 'EXCISE_DUTY'\n",
      " 'EXCISE_DUTY_EXEMPTED' 'VAT' 'VAT_EXEMPTED' 'FUEL_LEVY' 'ELV' 'EXP'\n",
      " 'VEH_REG' 'RAILWAY_DEV_LEVY' 'CUSTOMS_PROCESS_FEE' 'DECLARED_CUSTOMS_VAL'\n",
      " 'ASSESSED_CUSTOMS_VAL' 'ITEM_TAX_AMOUNT' 'dataid' 'src_country' 'iesign'\n",
      " 'version']\n",
      "2025-09-10 02:30:26,472 - Data Mapping - INFO - Import data mapping and cleaning takes 0.1 minutes\n",
      "2025-09-10 02:30:26,848 - Bash Mapping - INFO - target_tab shape: (345735, 47)\n",
      "2025-09-10 02:30:32,890 - File Process - INFO - CSV data saved to dataset/data_20250724/20251t3_csv_save2/智利/IMPORT/202502-IMP.csv\n",
      "2025-09-10 02:30:33,008 - Bash Mapping - INFO - dataset/data_20250724/20251t3_src_version/智利/import_paths/202503-IMP.csv\n",
      "2025-09-10 02:30:33,378 - File Process - INFO - CSV data saved to dataset/data_20250724/20251t3_csv_save2/坦桑尼亚/IMPORT/202503-IMP-RAW.csv\n",
      "2025-09-10 02:30:33,500 - Bash Mapping - INFO - dataset/data_20250724/20251t3_src_version/坦桑尼亚/import_paths/202502-IMP-RAW.csv\n",
      "2025-09-10 02:30:36,078 - Data Mapping - INFO - imp_df columns: ['DAY' 'MONTH' 'YEAR' 'CUSTOMS' 'CONTROL_ID' 'IMPORTER_ID' 'VERIFIER'\n",
      " 'IMPORTER' 'HS_CODE' 'HS_CODE_DESC' 'PRODUCT' 'BRAND' 'VARIETEY' 'DESC'\n",
      " 'ORIGIN_COUNTRY' 'SALES_COUNTRY' 'TRANS_TYPE' 'PAYMENT' 'ORIGIN_PORT'\n",
      " 'DEST_PORT' 'TRANS_CORP' 'TYPE_OF_LOAD' 'TYPE_OF_PACKAGE' 'G_WEIGHT'\n",
      " 'INCOTERMS' 'TAX' 'QUANTITY' 'UNIT_OF_QUANTITY' 'FOB' 'FREIGHT'\n",
      " 'INSURANCE' 'CIF' 'CIF_UNIT' 'FOB_UNIT' 'TRANS_CORP_COUNTRY' 'US_TAX'\n",
      " 'PACKAGES' 'ECONOMIC_ZONE' 'IMPORTER_ECONOMIC_KEY' 'MANIFEST_NUMBER'\n",
      " 'MANIFEST_DATE' 'TRANS_DOC_NUMBER' 'TRANS_DOC_DATE' 'dataid'\n",
      " 'src_country' 'iesign' 'version']\n",
      "2025-09-10 02:30:37,090 - Data Mapping - INFO - Import data mapping and cleaning takes 0.07 minutes\n",
      "2025-09-10 02:30:37,483 - Bash Mapping - INFO - target_tab shape: (385873, 47)\n",
      "2025-09-10 02:30:39,413 - Data Mapping - INFO - imp_df columns: ['CUSTOMS_OFFICE_CODE' 'TANSAD_YY' 'TANSAD_DATE' 'TANSAD_SERIAL_NO'\n",
      " 'TANSAD_FULL_NO' 'REGIME_NAME' 'TANSAD_YYYYMM' 'RELEASE_YYYYMM'\n",
      " 'PROCESSING_STATUS_NAME' 'ORIGIN_COUNTRY' 'CONSIGNMENT_COUNTRY'\n",
      " 'TRADING_COUNTRY' 'EXPORT_COUNTRY' 'DEST_COUNTRY' 'CRN'\n",
      " 'MANIFEST_SUBMISSION_DATE' 'BL_NO' 'IS_CONTAINERIZED' 'NO_OF_CONTAINERS'\n",
      " 'VESSEL_NAME' 'TRANSPORT_MEANS_ID' 'TRANSPORT_NATIONALITY_CD'\n",
      " 'ARRIVAL_DATE' 'DISCHARGE_DATE' 'ENTRY_OFFICE_CODE'\n",
      " 'SELECTIVITY_RESULT_CD' 'CLEARANCE_PLAN_CD' 'TRANSPORT_MODE_CODE'\n",
      " 'TRANSPORT_MODE_NAME' 'IMPORTER_TAX_ID' 'IMPORTER_NAME' 'EXPORTER_TAX_ID'\n",
      " 'EXPORTER_NAME' 'DECLARANT_TAX_ID' 'DECLARANT_NAME'\n",
      " 'ITEM_INVOICE_CARGO_VALUE_FOREIGN_CURRENCY' 'FOREIGN_CURRENCY'\n",
      " 'EXCHANGE_RATE' 'PREFERENCE_CODE' 'ITEM_NO' 'EPC_CODE' 'APC_CODE'\n",
      " 'HS_CODE' 'HS_CODE_DESC' 'PRODUCT_DESC' 'USED_YN' 'QUANTITY'\n",
      " 'UNIT_OF_MEASURE' 'TOTAL_FREIGHT_AMOUNT' 'ITEM_G_WEIGHT' 'ITEM_N_WEIGHT'\n",
      " 'ITEM_CARGO_VALUE_STATISTICS' 'ITEM_CUSTOMS_CARGO_VALUE' 'IMPORT_DUTY'\n",
      " 'IMPORT_DUTY_EXEMPTED' 'EXPORT_DUTY' 'EXPORT_DUTY_EXEMPTED' 'EXCISE_DUTY'\n",
      " 'EXCISE_DUTY_EXEMPTED' 'VAT' 'VAT_EXEMPTED' 'FUEL_LEVY' 'ELV' 'EXP'\n",
      " 'VEH_REG' 'RAILWAY_DEV_LEVY' 'CUSTOMS_PROCESS_FEE' 'DECLARED_CUSTOMS_VAL'\n",
      " 'ASSESSED_CUSTOMS_VAL' 'ITEM_TAX_AMOUNT' 'dataid' 'src_country' 'iesign'\n",
      " 'version']\n",
      "2025-09-10 02:30:40,231 - Data Mapping - INFO - Import data mapping and cleaning takes 0.11 minutes\n",
      "2025-09-10 02:30:40,610 - Bash Mapping - INFO - target_tab shape: (359832, 47)\n",
      "2025-09-10 02:30:48,850 - File Process - INFO - CSV data saved to dataset/data_20250724/20251t3_csv_save2/坦桑尼亚/IMPORT/202502-IMP-RAW.csv\n",
      "2025-09-10 02:30:48,952 - Bash Mapping - INFO - all data takes 0.85 minutes\n",
      "2025-09-10 02:30:48,953 - Data Goverance - INFO - 1\n",
      "2025-09-10 02:30:49,558 - File Process - INFO - CSV data saved to dataset/data_20250724/20251t3_csv_save2/智利/IMPORT/202503-IMP.csv\n",
      "2025-09-10 02:30:49,669 - Bash Mapping - INFO - all data takes 0.86 minutes\n",
      "2025-09-10 02:30:49,671 - Data Goverance - INFO - 1\n",
      "2025-09-10 02:30:49,917 - File Process - INFO - CSV data saved to dataset/data_20250724/20251t3_csv_save2/乌克兰/IMPORT/UA_IMPORT_202501.csv\n",
      "2025-09-10 02:30:50,109 - Bash Mapping - INFO - dataset/data_20250724/20251t3_src_version/乌克兰/import_paths/UA_IMPORT_202503.csv\n",
      "2025-09-10 02:30:53,682 - Data Mapping - INFO - imp_df columns: ['REG_DATE' 'CUSTOMS_OFFICE_CODE' 'DECLARATION_NUMBER' 'ITEM_NO' 'HS_CODE'\n",
      " 'PRODUCT_DESC' 'CUSTOMS_OFFICE_NAME_3WEI' 'CONSIGNEE_CODE'\n",
      " 'CONSIGNEE_NAME_ADDRESS' 'GUARANTEE_CODE' 'GUARANTEE_NAME_ADDRESS'\n",
      " 'DEPARTURE_COUNTRY' 'TRADING_COUNTRY' 'ORIGIN_COUNTRY' 'SHIPPER_NAME'\n",
      " 'CONTRACT_HOLDER_NAME' 'PLACE_OF_DELIVERY' 'INCOTERMS'\n",
      " 'TRANS_TYPE_AT_BORDER' 'TRANS_TYPE_WITHIN_COUNTRY' 'UOM' 'QTY'\n",
      " 'N_WEIGHT_IN_KG' 'INVOICE_VALUE_USD' 'CUSTOMS_VALUE_USD'\n",
      " 'DECLARATION_TYPE_CODE' 'DIRECTION' 'PREV_CUSTOMS_MODE_CODE'\n",
      " 'CUSTOMS_MODE' 'dataid' 'src_country' 'iesign' 'version']\n",
      "2025-09-10 02:30:56,341 - Data Mapping - INFO - Import data mapping and cleaning takes 0.1 minutes\n",
      "2025-09-10 02:30:57,158 - Bash Mapping - INFO - target_tab shape: (771565, 47)\n",
      "2025-09-10 02:31:06,655 - File Process - INFO - CSV data saved to dataset/data_20250724/20251t3_csv_save2/乌克兰/IMPORT/UA_IMPORT_202503.csv\n",
      "2025-09-10 02:31:06,952 - Bash Mapping - INFO - dataset/data_20250724/20251t3_src_version/乌克兰/import_paths/UA_IMPORT_202502.csv\n",
      "2025-09-10 02:31:10,255 - Data Mapping - INFO - imp_df columns: ['REG_DATE' 'CUSTOMS_OFFICE_CODE' 'DECLARATION_NUMBER' 'ITEM_NO' 'HS_CODE'\n",
      " 'PRODUCT_DESC' 'CUSTOMS_OFFICE_NAME_3WEI' 'CONSIGNEE_CODE'\n",
      " 'CONSIGNEE_NAME_ADDRESS' 'GUARANTEE_CODE' 'GUARANTEE_NAME_ADDRESS'\n",
      " 'DEPARTURE_COUNTRY' 'TRADING_COUNTRY' 'ORIGIN_COUNTRY' 'SHIPPER_NAME'\n",
      " 'CONTRACT_HOLDER_NAME' 'PLACE_OF_DELIVERY' 'INCOTERMS'\n",
      " 'TRANS_TYPE_AT_BORDER' 'TRANS_TYPE_WITHIN_COUNTRY' 'UOM' 'QTY'\n",
      " 'N_WEIGHT_IN_KG' 'INVOICE_VALUE_USD' 'CUSTOMS_VALUE_USD'\n",
      " 'DECLARATION_TYPE_CODE' 'DIRECTION' 'PREV_CUSTOMS_MODE_CODE'\n",
      " 'CUSTOMS_MODE' 'dataid' 'src_country' 'iesign' 'version']\n",
      "2025-09-10 02:31:12,511 - Data Mapping - INFO - Import data mapping and cleaning takes 0.09 minutes\n",
      "2025-09-10 02:31:13,243 - Bash Mapping - INFO - target_tab shape: (675826, 47)\n",
      "2025-09-10 02:31:21,507 - File Process - INFO - CSV data saved to dataset/data_20250724/20251t3_csv_save2/乌克兰/IMPORT/UA_IMPORT_202502.csv\n",
      "2025-09-10 02:31:21,757 - Bash Mapping - INFO - all data takes 1.39 minutes\n",
      "2025-09-10 02:31:21,757 - Data Goverance - INFO - 1\n",
      "2025-09-10 02:31:21,768 - Data Goverance - INFO - country_normal_mapping takes 1.3929506023724874 minutes\n"
     ]
    }
   ],
   "source": [
    "# Import batch run: map every country's import files concurrently, one task per country.\n",
    "start = time.time()\n",
    "\n",
    "# Each task gets a single-country list plus the shared input root, output root and column dictionary.\n",
    "params_list = [\n",
    "    ([ctry], root_dir, csv_files_path, all_country_cols_dict)\n",
    "    for ctry in country_list\n",
    "]\n",
    "with ThreadPoolExecutor(max_workers=10) as executor:\n",
    "    # Submit every task up front; each parameter tuple is unpacked into the worker's positional arguments.\n",
    "    futures = [executor.submit(bash_mapping_import, *params) for params in params_list]\n",
    "    # Log each task's return value in completion order (not submission order).\n",
    "    for future in as_completed(futures):\n",
    "        logger.info(future.result())\n",
    "\n",
    "end = time.time()\n",
    "logger.info(f'country_normal_mapping takes {(end-start)/60} minutes')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "c2f27e0d-c342-438c-b148-ea7e48fd5f08",
   "metadata": {},
   "outputs": [],
   "source": []
  },
  {
   "cell_type": "markdown",
   "id": "28bcc84d-20ff-4f10-b8c5-304ed5cc36ea",
   "metadata": {},
   "source": [
    "## 5.3 乌克兰1月出口单独处理"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "a64ec6c7-dc54-4a63-a8f0-1b4548860150",
   "metadata": {},
   "source": [
    "数据来源路径： 'dataset/data_20250724/2025年1-3月汇总/乌克兰/Ukraine_E_2025.01.xlsx'\n",
    "\n",
    "处理时添加以下字段：\n",
    "\n",
    "- 数据主键 dataid\n",
    "- 来源国家 src_country（'UKR'）\n",
    "- 进出口标识 iesign（'E'）\n",
    "- 数据版本 version（'202501'）\n",
    "\n",
    "源数据提取为csv路径： 'dataset/data_20250724/UA_EXPORT_src_202501.csv'\n",
    "\n",
    "mapping结果文件保存路径： 'dataset/data_20250724/20251t3_csv_save2/乌克兰/EXPORT/UA_EXPORT_202501.csv'"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 26,
   "id": "da4cff0c-5076-4ab6-a201-b890ebb1f73c",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "2025-09-10 02:31:53,260 - Data Goverance - INFO - table shape: (115577, 54)\n"
     ]
    }
   ],
   "source": [
    "# Ukraine January export: load the source workbook, tag every row, and dump the result to CSV.\n",
    "start = time.time()  # the later mapping cell reuses this to report the total elapsed time\n",
    "\n",
    "# Open the workbook once and close the handle when done; only the first worksheet is read.\n",
    "with pd.ExcelFile('dataset/data_20250724/2025年1-3月汇总/乌克兰/Ukraine_E_2025.01.xlsx') as excel_file:\n",
    "    sheet_names = excel_file.sheet_names\n",
    "    exp_df = pd.read_excel(excel_file, sheet_name=sheet_names[0])\n",
    "\n",
    "# Row identifier: the first 16 hex characters of a random UUID4.\n",
    "exp_df['dataid'] = [uuid.uuid4().hex[:16] for _ in range(exp_df.shape[0])]\n",
    "# Constant tag columns written on every row of this file.\n",
    "exp_df['src_country'] = 'UKR'\n",
    "exp_df['iesign'] = 'E'\n",
    "exp_df['version'] = '202501'\n",
    "\n",
    "# logger.info(f'table columns: {list(exp_df.columns.values)}')\n",
    "logger.info(f'table shape: {exp_df.shape}')\n",
    "\n",
    "# Intermediate CSV that the mapping cell reads back by path.\n",
    "exp_df.to_csv('dataset/data_20250724/UA_EXPORT_src_202501.csv'\n",
    "              , sep=','\n",
    "              , index=False\n",
    "              , quotechar='\"'         # double quote as the quoting character\n",
    "              , quoting=csv.QUOTE_ALL        # force quotes around every field\n",
    "              , lineterminator='\\n', header=True, encoding='utf-8')\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 27,
   "id": "1d50af54-a9b8-4d5f-8d06-f6d61c120e39",
   "metadata": {
    "scrolled": true
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "2025-09-10 02:31:57,869 - Data Mapping - INFO - imp_df columns: ['EXPORT_DECLARATION_TYPE_1' 'EXPORT_DECLARATION_TYPE_2'\n",
      " 'EXPORT_DECLARATION_TYPE_3' 'EXPORT_DECLARATION_TYPE_4'\n",
      " 'EXPORT_DECLARATION_NUMBER_1' 'EXPORT_DECLARATION_NUMBER_2'\n",
      " 'EXPORT_DECLARATION_NUMBER_3' 'REG_DATE' 'EXPORT_CLEARANCE_CUSTOMS'\n",
      " 'EXPORT_CLEARANCE_POST' 'TRADE_COUNTRY' 'COUNTRY OF SHIPPING'\n",
      " 'COUNTRY_OF_DESTINATION'\n",
      " 'INDICATOR_OF_GOODS_IN_CONTAINER (K = IN_CONTAINER, EMPTY = NOT_IN_CONTAINER)'\n",
      " 'CONTAINER_NUMBER' 'DELIVERY_TERMS_INCOTERMS' 'DELIVERY_ADDRESS'\n",
      " 'CONTRACT_CURRENCY_CODE' 'CONTRACT_CURRENCY' 'TRANSPORT_CODE_AT_BORDER'\n",
      " 'TRANSPORT_AT_BORDER' 'TRANSPORT_NUMBER_AT_BORDER'\n",
      " 'TRANSPORT_CODE_INSIDE_COUNTRY' 'TRANSPORT_INSIDE_COUNTRY'\n",
      " 'CUSTOMS_CODE_AT_BORDER' 'CUSTOMS_AT_BORDER' 'CUSTOM POST_AT_BORDER'\n",
      " 'GOODS_NUMBER' 'GOODS_HS_CODE' 'GOODS_NAME'\n",
      " 'REGISTRATION_CODE_OF_SENDER_COMPANY' 'SENDER_COMPANY_NAME'\n",
      " 'EXPORT_DECLARATION_BACKSIDE_1' 'EXPORT_DECLARATION_BACKSIDE_2'\n",
      " 'EXPORT_DECLARATION_BACKSIDE_3' 'RECIPIENT_COMPANY_NAME'\n",
      " 'REGISTRATION_CODE_OF_CONTRACT_HOLDER_COMPANY'\n",
      " 'CONTRACT_HOLDER_COMPANY_NAME' 'METHOD_OF_CUSTOMS_VALUE_DETERMINATION'\n",
      " 'QUANTITY_IN_ADDITIONAL_UNIT_OF_MEASURE' 'ADDITIONAL_UNIT_OF_MEASURE'\n",
      " 'GROSS_WEIGHT_KG' 'NET_WEIGHT_KG' 'INVOICE_VALUE_UAH' 'INVOICE_VALUE_USD'\n",
      " 'NBU_EXCHANGE_RATE_USD' 'CUSTOMS_VALUE_UAH' 'CUSTOMS_VALUE_USD'\n",
      " 'CUSTOMS_VALUE_USD_KG' 'TAX_UAH' 'dataid' 'src_country' 'iesign'\n",
      " 'version']\n",
      "2025-09-10 02:31:58,098 - Data Mapping - INFO - Export data mapping and cleaning takes 0.02 minutes\n",
      "2025-09-10 02:31:59,550 - Data Goverance - INFO - UKraine 01mth export deal takes 0.6296241283416748 minutes\n"
     ]
    }
   ],
   "source": [
    "func_map = [x for x in function_map if x.get('中文国名')=='乌克兰']\n",
    "exp_path = 'dataset/data_20250724/UA_EXPORT_src_202501.csv'\n",
    "src_cols = exp_df.columns.values\n",
    "\n",
    "target_tab = func_map[0].get('export_fuc2')(exp_path, src_cols)\n",
    "# target_tab.shape\n",
    "target_tab.to_csv('dataset/data_20250724/UA_EXPORT_202501.csv'\n",
    "                  , sep=',', index=False, header=True, encoding='utf-8')\n",
    "\n",
    "end = time.time()\n",
    "logger.info(f'UKraine 01mth export deal takes {(end-start)/60} minutes')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 28,
   "id": "ca49cd3f-9f87-4f72-a9b4-90f16faf0ee2",
   "metadata": {},
   "outputs": [],
   "source": [
    "# 将结果移动到指定位置\n",
    "import shutil\n",
    "\n",
    "src_file='dataset/data_20250724/UA_EXPORT_202501.csv'\n",
    "dest_dir='dataset/data_20250724/20251t3_csv_save2/乌克兰/EXPORT/UA_EXPORT_202501.csv'\n",
    "\n",
    "shutil.copy2(src_file, dest_dir) \n",
    "os.remove(src_file)\n",
    "del src_file, dest_dir"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 29,
   "id": "67f0cb6c-729f-4c7f-88f3-384fe98314e1",
   "metadata": {},
   "outputs": [],
   "source": [
    "# !mv dataset/data_20250724/UA_EXPORT_202501.csv dataset/data_20250724/20251t3_csv_save2/乌克兰/EXPORT/"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "dd0ebb1d-f330-433a-8ae0-7efdeef06584",
   "metadata": {},
   "outputs": [],
   "source": []
  },
  {
   "cell_type": "markdown",
   "id": "965e6f87-8153-468f-a4d2-f39ea7c658dc",
   "metadata": {},
   "source": [
    "# 6 利用LLM做国家标准化"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 30,
   "id": "05543fd3-28a2-42d9-8d3b-dd29b4ec222e",
   "metadata": {},
   "outputs": [],
   "source": [
    "country_list = ['智利', '坦桑尼亚', '乌克兰']\n",
    "# country_list = ['智利']"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 31,
   "id": "c7f98279-5124-45b0-82af-9a0f4d94d23c",
   "metadata": {},
   "outputs": [],
   "source": [
    "csv_files_path = Path('dataset/data_20250724/20251t3_csv_save2')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 32,
   "id": "dd2117d8-fe8e-484d-85a2-b00024a24b97",
   "metadata": {},
   "outputs": [],
   "source": [
    "pth_change_tag='20251t3_csv_correct2'"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 33,
   "id": "23e53224-2125-43a0-b0bb-2c4c2d5f2626",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "2025-09-10 02:32:00,050 - Data Goverance - INFO - country recore count is 1886\n",
      "2025-09-10 02:32:00,050 - Data Goverance - INFO - country_mapping count is 1906\n"
     ]
    }
   ],
   "source": [
    "logger.info(f'country recore count is {len(all_country_list)}')\n",
    "logger.info(f'country_mapping count is {len(country_mapping)}')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 34,
   "id": "595b49e9-0c7d-4ed1-b816-752bdbc5f8ec",
   "metadata": {
    "scrolled": true
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "2025-09-10 02:32:00,055 - File Process - INFO - dataset/data_20250724/20251t3_csv_save2/坦桑尼亚/EXPORT/202501-EXP-RAW.csv\n",
      "2025-09-10 02:32:00,055 - File Process - INFO - dataset/data_20250724/20251t3_csv_save2/乌克兰/EXPORT/UA_EXPORT_202503.csv\n",
      "2025-09-10 02:32:00,055 - File Process - INFO - dataset/data_20250724/20251t3_csv_save2/智利/EXPORT/202502-EXP.csv\n",
      "2025-09-10 02:32:00,450 - File Process - INFO - correct country saved to dataset/data_20250724/20251t3_csv_correct2/坦桑尼亚/EXPORT/202501-EXP-RAW.csv\n",
      "2025-09-10 02:32:00,451 - File Process - INFO - dataset/data_20250724/20251t3_csv_save2/坦桑尼亚/EXPORT/202503-EXP-RAW.csv\n",
      "2025-09-10 02:32:01,181 - File Process - INFO - correct country saved to dataset/data_20250724/20251t3_csv_correct2/坦桑尼亚/EXPORT/202503-EXP-RAW.csv\n",
      "2025-09-10 02:32:01,183 - File Process - INFO - dataset/data_20250724/20251t3_csv_save2/坦桑尼亚/EXPORT/202503-TRANS-RAW.csv\n",
      "2025-09-10 02:32:04,350 - File Process - INFO - correct country saved to dataset/data_20250724/20251t3_csv_correct2/智利/EXPORT/202502-EXP.csv\n",
      "2025-09-10 02:32:04,356 - File Process - INFO - dataset/data_20250724/20251t3_csv_save2/智利/EXPORT/202501-EXP.csv\n",
      "2025-09-10 02:32:07,423 - File Process - INFO - correct country saved to dataset/data_20250724/20251t3_csv_correct2/坦桑尼亚/EXPORT/202503-TRANS-RAW.csv\n",
      "2025-09-10 02:32:07,429 - File Process - INFO - dataset/data_20250724/20251t3_csv_save2/坦桑尼亚/EXPORT/202502-TRANS-RAW.csv\n",
      "2025-09-10 02:32:08,065 - File Process - INFO - correct country saved to dataset/data_20250724/20251t3_csv_correct2/智利/EXPORT/202501-EXP.csv\n",
      "2025-09-10 02:32:08,073 - File Process - INFO - dataset/data_20250724/20251t3_csv_save2/智利/EXPORT/202503-EXP.csv\n",
      "2025-09-10 02:32:11,017 - File Process - INFO - correct country saved to dataset/data_20250724/20251t3_csv_correct2/乌克兰/EXPORT/UA_EXPORT_202503.csv\n",
      "2025-09-10 02:32:11,021 - File Process - INFO - dataset/data_20250724/20251t3_csv_save2/乌克兰/EXPORT/UA_EXPORT_202501.csv\n",
      "2025-09-10 02:32:11,506 - File Process - INFO - correct country saved to dataset/data_20250724/20251t3_csv_correct2/智利/EXPORT/202503-EXP.csv\n",
      "2025-09-10 02:32:11,509 - File Process - INFO - dataset/data_20250724/20251t3_csv_save2/智利/IMPORT/202501-IMP.csv\n",
      "2025-09-10 02:32:12,369 - File Process - INFO - correct country saved to dataset/data_20250724/20251t3_csv_correct2/坦桑尼亚/EXPORT/202502-TRANS-RAW.csv\n",
      "2025-09-10 02:32:12,370 - File Process - INFO - dataset/data_20250724/20251t3_csv_save2/坦桑尼亚/EXPORT/202501-TRANS-RAW.csv\n",
      "2025-09-10 02:32:19,709 - File Process - INFO - correct country saved to dataset/data_20250724/20251t3_csv_correct2/坦桑尼亚/EXPORT/202501-TRANS-RAW.csv\n",
      "2025-09-10 02:32:19,713 - File Process - INFO - dataset/data_20250724/20251t3_csv_save2/坦桑尼亚/EXPORT/202502-EXP-RAW.csv\n",
      "2025-09-10 02:32:20,000 - File Process - INFO - correct country saved to dataset/data_20250724/20251t3_csv_correct2/坦桑尼亚/EXPORT/202502-EXP-RAW.csv\n",
      "2025-09-10 02:32:20,005 - File Process - INFO - dataset/data_20250724/20251t3_csv_save2/坦桑尼亚/IMPORT/202501-IMP-RAW.csv\n",
      "2025-09-10 02:32:21,796 - File Process - INFO - correct country saved to dataset/data_20250724/20251t3_csv_correct2/乌克兰/EXPORT/UA_EXPORT_202501.csv\n",
      "2025-09-10 02:32:21,870 - File Process - INFO - dataset/data_20250724/20251t3_csv_save2/乌克兰/EXPORT/UA_EXPORT_202502.csv\n",
      "2025-09-10 02:32:27,843 - File Process - INFO - correct country saved to dataset/data_20250724/20251t3_csv_correct2/智利/IMPORT/202501-IMP.csv\n",
      "2025-09-10 02:32:27,853 - File Process - INFO - dataset/data_20250724/20251t3_csv_save2/智利/IMPORT/202502-IMP.csv\n",
      "2025-09-10 02:32:32,852 - File Process - INFO - correct country saved to dataset/data_20250724/20251t3_csv_correct2/乌克兰/EXPORT/UA_EXPORT_202502.csv\n",
      "2025-09-10 02:32:32,856 - File Process - INFO - dataset/data_20250724/20251t3_csv_save2/乌克兰/IMPORT/UA_IMPORT_202501.csv\n",
      "2025-09-10 02:32:34,528 - File Process - INFO - correct country saved to dataset/data_20250724/20251t3_csv_correct2/坦桑尼亚/IMPORT/202501-IMP-RAW.csv\n",
      "2025-09-10 02:32:34,542 - File Process - INFO - dataset/data_20250724/20251t3_csv_save2/坦桑尼亚/IMPORT/202503-IMP-RAW.csv\n",
      "2025-09-10 02:32:42,383 - File Process - INFO - correct country saved to dataset/data_20250724/20251t3_csv_correct2/智利/IMPORT/202502-IMP.csv\n",
      "2025-09-10 02:32:42,496 - File Process - INFO - dataset/data_20250724/20251t3_csv_save2/智利/IMPORT/202503-IMP.csv\n",
      "2025-09-10 02:32:51,778 - File Process - INFO - correct country saved to dataset/data_20250724/20251t3_csv_correct2/坦桑尼亚/IMPORT/202503-IMP-RAW.csv\n",
      "2025-09-10 02:32:51,783 - File Process - INFO - dataset/data_20250724/20251t3_csv_save2/坦桑尼亚/IMPORT/202502-IMP-RAW.csv\n",
      "2025-09-10 02:32:59,352 - File Process - INFO - correct country saved to dataset/data_20250724/20251t3_csv_correct2/智利/IMPORT/202503-IMP.csv\n",
      "2025-09-10 02:32:59,354 - File Process - INFO - Correct country takes 59 seconds\n",
      "2025-09-10 02:32:59,411 - Data Goverance - INFO - 1\n",
      "2025-09-10 02:33:00,763 - File Process - INFO - correct country saved to dataset/data_20250724/20251t3_csv_correct2/坦桑尼亚/IMPORT/202502-IMP-RAW.csv\n",
      "2025-09-10 02:33:00,764 - File Process - INFO - Correct country takes 61 seconds\n",
      "2025-09-10 02:33:00,810 - Data Goverance - INFO - 1\n",
      "2025-09-10 02:33:08,195 - File Process - INFO - correct country saved to dataset/data_20250724/20251t3_csv_correct2/乌克兰/IMPORT/UA_IMPORT_202501.csv\n",
      "2025-09-10 02:33:08,198 - File Process - INFO - dataset/data_20250724/20251t3_csv_save2/乌克兰/IMPORT/UA_IMPORT_202503.csv\n",
      "2025-09-10 02:33:24,456 - File Process - INFO - correct country saved to dataset/data_20250724/20251t3_csv_correct2/乌克兰/IMPORT/UA_IMPORT_202503.csv\n",
      "2025-09-10 02:33:24,458 - File Process - INFO - dataset/data_20250724/20251t3_csv_save2/乌克兰/IMPORT/UA_IMPORT_202502.csv\n",
      "2025-09-10 02:33:38,752 - File Process - INFO - correct country saved to dataset/data_20250724/20251t3_csv_correct2/乌克兰/IMPORT/UA_IMPORT_202502.csv\n",
      "2025-09-10 02:33:38,754 - File Process - INFO - Correct country takes 99 seconds\n",
      "2025-09-10 02:33:38,888 - Data Goverance - INFO - 1\n",
      "2025-09-10 02:33:38,893 - Data Goverance - INFO - country_normal_mapping takes 1.6473256468772888 minutes\n"
     ]
    }
   ],
   "source": [
    "# 更新国家 - 多线程\n",
    "start = time.time()\n",
    "\n",
    "params_list = [(country_mapping, [ctry], csv_files_path, pth_change_tag) for ctry in country_list]\n",
    "with ThreadPoolExecutor(max_workers=10) as executor:\n",
    "    # 提交多个任务\n",
    "    futures = [\n",
    "        executor.submit(lambda p: country_normal_mapping(*p), params)\n",
    "        for params in params_list\n",
    "    ]\n",
    "    # 按完成顺序获取结果\n",
    "    for future in as_completed(futures):\n",
    "        logger.info(future.result())\n",
    "\n",
    "end = time.time()\n",
    "logger.info(f'country_normal_mapping takes {(end-start)/60} minutes')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 35,
   "id": "cf8c4ff1-a093-4a03-861d-aec9be1f4df0",
   "metadata": {},
   "outputs": [],
   "source": [
    "# # 更新国家 - 单线程\n",
    "# country_normal_mapping(country_mapping, country_list, csv_files_path, pth_change_tag='20251t3_csv_correct')"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "b7260d38-389a-4cee-9869-84eb787be0a2",
   "metadata": {},
   "source": [
    "# 7 货币单位校正"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "1ad9a8c9-7fa3-4e36-a246-251a9963494d",
   "metadata": {},
   "source": [
    "## 7.3 替换货币"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 36,
   "id": "b5e6e228-1659-402b-a412-51c9d58bd4fd",
   "metadata": {},
   "outputs": [],
   "source": [
    "currency_on_usd_dict=currency_on_usd"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 37,
   "id": "5f2443fd-3149-403c-8cc5-c5b1c6d6806f",
   "metadata": {},
   "outputs": [],
   "source": [
    "from_dir=Path('dataset/data_20250724/20251t3_csv_correct2')\n",
    "to_dir=Path('dataset/data_20250724/20251t3_csv_correct3')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 38,
   "id": "4256e304-0aad-434e-b472-6920160036eb",
   "metadata": {},
   "outputs": [],
   "source": [
    "country_list = ['智利', '坦桑尼亚', '乌克兰']\n",
    "# country_list = ['智利']"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 39,
   "id": "545b8a69-505f-41b5-aa95-9539bdf1b3ac",
   "metadata": {
    "scrolled": true
   },
   "outputs": [],
   "source": [
    "# replace_currency_to_usd(country_list, currency_on_usd_dict, from_dir, to_dir)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 40,
   "id": "a33676ca-f163-448f-9cf7-4ccb502d1f9b",
   "metadata": {
    "scrolled": true
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "2025-09-10 02:33:38,920 - File Process - INFO - the path : dataset/data_20250724/20251t3_csv_correct2/智利/EXPORT/202502-EXP.csv\n",
      "2025-09-10 02:33:38,920 - File Process - INFO - the path : dataset/data_20250724/20251t3_csv_correct2/乌克兰/EXPORT/UA_EXPORT_202503.csv\n",
      "2025-09-10 02:33:38,920 - File Process - INFO - the path : dataset/data_20250724/20251t3_csv_correct2/坦桑尼亚/EXPORT/202501-EXP-RAW.csv\n",
      "2025-09-10 02:33:39,295 - File Process - INFO - save path: dataset/data_20250724/20251t3_csv_correct3/坦桑尼亚/EXPORT/202501-EXP-RAW.csv\n",
      "2025-09-10 02:33:39,295 - File Process - INFO - currency transform done, the data shape is (9990, 47)\n",
      "2025-09-10 02:33:39,296 - File Process - INFO - the path : dataset/data_20250724/20251t3_csv_correct2/坦桑尼亚/EXPORT/202503-EXP-RAW.csv\n",
      "2025-09-10 02:33:39,917 - File Process - INFO - save path: dataset/data_20250724/20251t3_csv_correct3/坦桑尼亚/EXPORT/202503-EXP-RAW.csv\n",
      "2025-09-10 02:33:39,920 - File Process - INFO - currency transform done, the data shape is (7761, 47)\n",
      "2025-09-10 02:33:39,921 - File Process - INFO - the path : dataset/data_20250724/20251t3_csv_correct2/坦桑尼亚/EXPORT/202503-TRANS-RAW.csv\n",
      "2025-09-10 02:33:42,807 - File Process - INFO - save path: dataset/data_20250724/20251t3_csv_correct3/智利/EXPORT/202502-EXP.csv\n",
      "2025-09-10 02:33:42,816 - File Process - INFO - currency transform done, the data shape is (98419, 47)\n",
      "2025-09-10 02:33:42,817 - File Process - INFO - the path : dataset/data_20250724/20251t3_csv_correct2/智利/EXPORT/202501-EXP.csv\n",
      "2025-09-10 02:33:45,040 - File Process - INFO - save path: dataset/data_20250724/20251t3_csv_correct3/坦桑尼亚/EXPORT/202503-TRANS-RAW.csv\n",
      "2025-09-10 02:33:45,045 - File Process - INFO - currency transform done, the data shape is (133800, 47)\n",
      "2025-09-10 02:33:45,046 - File Process - INFO - the path : dataset/data_20250724/20251t3_csv_correct2/坦桑尼亚/EXPORT/202502-TRANS-RAW.csv\n",
      "2025-09-10 02:33:46,794 - File Process - INFO - save path: dataset/data_20250724/20251t3_csv_correct3/智利/EXPORT/202501-EXP.csv\n",
      "2025-09-10 02:33:46,799 - File Process - INFO - currency transform done, the data shape is (127496, 47)\n",
      "2025-09-10 02:33:46,801 - File Process - INFO - the path : dataset/data_20250724/20251t3_csv_correct2/智利/EXPORT/202503-EXP.csv\n",
      "2025-09-10 02:33:48,098 - File Process - INFO - save path: dataset/data_20250724/20251t3_csv_correct3/乌克兰/EXPORT/UA_EXPORT_202503.csv\n",
      "2025-09-10 02:33:48,100 - File Process - INFO - currency transform done, the data shape is (141342, 47)\n",
      "2025-09-10 02:33:48,101 - File Process - INFO - the path : dataset/data_20250724/20251t3_csv_correct2/乌克兰/EXPORT/UA_EXPORT_202501.csv\n",
      "2025-09-10 02:33:49,955 - File Process - INFO - save path: dataset/data_20250724/20251t3_csv_correct3/坦桑尼亚/EXPORT/202502-TRANS-RAW.csv\n",
      "2025-09-10 02:33:49,957 - File Process - INFO - currency transform done, the data shape is (129629, 47)\n",
      "2025-09-10 02:33:49,961 - File Process - INFO - the path : dataset/data_20250724/20251t3_csv_correct2/坦桑尼亚/EXPORT/202501-TRANS-RAW.csv\n",
      "2025-09-10 02:33:50,214 - File Process - INFO - save path: dataset/data_20250724/20251t3_csv_correct3/智利/EXPORT/202503-EXP.csv\n",
      "2025-09-10 02:33:50,216 - File Process - INFO - currency transform done, the data shape is (100956, 47)\n",
      "2025-09-10 02:33:50,219 - File Process - INFO - the path : dataset/data_20250724/20251t3_csv_correct2/智利/IMPORT/202501-IMP.csv\n",
      "2025-09-10 02:33:56,148 - File Process - INFO - save path: dataset/data_20250724/20251t3_csv_correct3/坦桑尼亚/EXPORT/202501-TRANS-RAW.csv\n",
      "2025-09-10 02:33:56,149 - File Process - INFO - currency transform done, the data shape is (152034, 47)\n",
      "2025-09-10 02:33:56,150 - File Process - INFO - the path : dataset/data_20250724/20251t3_csv_correct2/坦桑尼亚/EXPORT/202502-EXP-RAW.csv\n",
      "2025-09-10 02:33:56,406 - File Process - INFO - save path: dataset/data_20250724/20251t3_csv_correct3/坦桑尼亚/EXPORT/202502-EXP-RAW.csv\n",
      "2025-09-10 02:33:56,407 - File Process - INFO - currency transform done, the data shape is (7660, 47)\n",
      "2025-09-10 02:33:56,409 - File Process - INFO - the path : dataset/data_20250724/20251t3_csv_correct2/坦桑尼亚/IMPORT/202501-IMP-RAW.csv\n",
      "2025-09-10 02:33:57,217 - File Process - INFO - save path: dataset/data_20250724/20251t3_csv_correct3/乌克兰/EXPORT/UA_EXPORT_202501.csv\n",
      "2025-09-10 02:33:57,220 - File Process - INFO - currency transform done, the data shape is (115577, 47)\n",
      "2025-09-10 02:33:57,222 - File Process - INFO - the path : dataset/data_20250724/20251t3_csv_correct2/乌克兰/EXPORT/UA_EXPORT_202502.csv\n",
      "2025-09-10 02:34:04,525 - File Process - INFO - save path: dataset/data_20250724/20251t3_csv_correct3/智利/IMPORT/202501-IMP.csv\n",
      "2025-09-10 02:34:04,531 - File Process - INFO - currency transform done, the data shape is (383012, 47)\n",
      "2025-09-10 02:34:04,532 - File Process - INFO - the path : dataset/data_20250724/20251t3_csv_correct2/智利/IMPORT/202502-IMP.csv\n",
      "2025-09-10 02:34:05,213 - File Process - INFO - save path: dataset/data_20250724/20251t3_csv_correct3/乌克兰/EXPORT/UA_EXPORT_202502.csv\n",
      "2025-09-10 02:34:05,214 - File Process - INFO - currency transform done, the data shape is (130121, 47)\n",
      "2025-09-10 02:34:05,218 - File Process - INFO - the path : dataset/data_20250724/20251t3_csv_correct2/乌克兰/IMPORT/UA_IMPORT_202501.csv\n",
      "2025-09-10 02:34:10,031 - File Process - INFO - save path: dataset/data_20250724/20251t3_csv_correct3/坦桑尼亚/IMPORT/202501-IMP-RAW.csv\n",
      "2025-09-10 02:34:10,043 - File Process - INFO - currency transform done, the data shape is (432559, 47)\n",
      "2025-09-10 02:34:10,044 - File Process - INFO - the path : dataset/data_20250724/20251t3_csv_correct2/坦桑尼亚/IMPORT/202503-IMP-RAW.csv\n",
      "2025-09-10 02:34:20,205 - File Process - INFO - save path: dataset/data_20250724/20251t3_csv_correct3/智利/IMPORT/202502-IMP.csv\n",
      "2025-09-10 02:34:20,208 - File Process - INFO - currency transform done, the data shape is (354974, 47)\n",
      "2025-09-10 02:34:20,210 - File Process - INFO - the path : dataset/data_20250724/20251t3_csv_correct2/智利/IMPORT/202503-IMP.csv\n",
      "2025-09-10 02:34:22,079 - File Process - INFO - save path: dataset/data_20250724/20251t3_csv_correct3/坦桑尼亚/IMPORT/202503-IMP-RAW.csv\n",
      "2025-09-10 02:34:22,094 - File Process - INFO - currency transform done, the data shape is (345735, 47)\n",
      "2025-09-10 02:34:22,099 - File Process - INFO - the path : dataset/data_20250724/20251t3_csv_correct2/坦桑尼亚/IMPORT/202502-IMP-RAW.csv\n",
      "2025-09-10 02:34:33,377 - File Process - INFO - save path: dataset/data_20250724/20251t3_csv_correct3/坦桑尼亚/IMPORT/202502-IMP-RAW.csv\n",
      "2025-09-10 02:34:33,377 - File Process - INFO - currency transform done, the data shape is (359832, 47)\n",
      "2025-09-10 02:34:33,378 - File Process - INFO - replace currency to USD takes 54 seconds\n",
      "2025-09-10 02:34:33,412 - Data Goverance - INFO - 1\n",
      "2025-09-10 02:34:33,487 - File Process - INFO - save path: dataset/data_20250724/20251t3_csv_correct3/智利/IMPORT/202503-IMP.csv\n",
      "2025-09-10 02:34:33,488 - File Process - INFO - currency transform done, the data shape is (385873, 47)\n",
      "2025-09-10 02:34:33,489 - File Process - INFO - replace currency to USD takes 55 seconds\n",
      "2025-09-10 02:34:33,531 - Data Goverance - INFO - 1\n",
      "2025-09-10 02:34:40,099 - File Process - INFO - save path: dataset/data_20250724/20251t3_csv_correct3/乌克兰/IMPORT/UA_IMPORT_202501.csv\n",
      "2025-09-10 02:34:40,100 - File Process - INFO - currency transform done, the data shape is (652325, 47)\n",
      "2025-09-10 02:34:40,100 - File Process - INFO - the path : dataset/data_20250724/20251t3_csv_correct2/乌克兰/IMPORT/UA_IMPORT_202503.csv\n",
      "2025-09-10 02:34:55,246 - File Process - INFO - save path: dataset/data_20250724/20251t3_csv_correct3/乌克兰/IMPORT/UA_IMPORT_202503.csv\n",
      "2025-09-10 02:34:55,246 - File Process - INFO - currency transform done, the data shape is (771565, 47)\n",
      "2025-09-10 02:34:55,247 - File Process - INFO - the path : dataset/data_20250724/20251t3_csv_correct2/乌克兰/IMPORT/UA_IMPORT_202502.csv\n",
      "2025-09-10 02:35:08,835 - File Process - INFO - save path: dataset/data_20250724/20251t3_csv_correct3/乌克兰/IMPORT/UA_IMPORT_202502.csv\n",
      "2025-09-10 02:35:08,835 - File Process - INFO - currency transform done, the data shape is (675826, 47)\n",
      "2025-09-10 02:35:08,835 - File Process - INFO - replace currency to USD takes 90 seconds\n",
      "2025-09-10 02:35:08,910 - Data Goverance - INFO - 1\n",
      "2025-09-10 02:35:08,918 - Data Goverance - INFO - replace_currency_to_usd takes 1.4999861001968384 minutes\n"
     ]
    }
   ],
   "source": [
    "start = time.time()\n",
    "\n",
    "params_list = [([ctry], currency_on_usd_dict, from_dir, to_dir) for ctry in country_list]\n",
    "with ThreadPoolExecutor(max_workers=10) as executor:\n",
    "    # 提交多个任务\n",
    "    futures = [\n",
    "        executor.submit(lambda p: replace_currency_to_usd(*p), params)\n",
    "        for params in params_list\n",
    "    ]\n",
    "    # 按完成顺序获取结果\n",
    "    for future in as_completed(futures):\n",
    "        logger.info(future.result())\n",
    "\n",
    "end = time.time()\n",
    "logger.info(f'replace_currency_to_usd takes {(end-start)/60} minutes')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "6b5ef8b8-72ad-4f92-8d1b-c9b238f7e79d",
   "metadata": {},
   "outputs": [],
   "source": []
  },
  {
   "cell_type": "markdown",
   "id": "a7c43cfa-c44f-4495-9c27-7838f6ef706e",
   "metadata": {},
   "source": [
    "## 7.4 结果检查"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 41,
   "id": "4fbbbddc-4a16-431f-8075-c5f3d8b201ab",
   "metadata": {
    "scrolled": true
   },
   "outputs": [],
   "source": [
    "# !head -3 dataset/data_20250724/20251t3_csv_correct3/印度/EXPORT/INDIA_EXPORT_202501_2106.csv\n",
    "# !head -10 dataset/data_20250724/20251t3_csv_correct3/纳米比亚/IMPORT/202501-IMP-FULL.csv\n",
    "# !head -3 dataset/data_20250724/20251t3_csv_correct1/坦桑尼亚/EXPORT/202504-EXP-RAW.csv\n",
    "\n",
    "# !head -3 dataset/data_20250724/20251t3_csv_correct1/乌克兰/EXPORT/UA_EXPORT_202504.csv\n",
    "\n",
    "\n",
    "\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 42,
   "id": "52a03cba-5b36-491c-8db8-846680f20882",
   "metadata": {
    "scrolled": true
   },
   "outputs": [],
   "source": [
    "# # df = pd.read_csv('dataset/data_20250724/20251t3_csv_correct3/智利/IMPORT/202501-IMP.csv')\n",
    "# df = pd.read_csv('dataset/data_20250724/20251t3_csv_correct3/菲律宾/IMPORT/PH_IMPORT_202501.csv')\n",
    "# df = pd.read_csv('dataset/data_20250724/20251t3_csv_correct3/纳米比亚/IMPORT/202501-IMP-FULL.csv')\n",
    "# df = pd.read_csv('dataset/data_20250724/20251t3_csv_save2/纳米比亚/IMPORT/202501-IMP-FULL.csv')\n",
    "\n",
    "# df = pd.read_csv('dataset/data_20250724/20251t3_csv_correct3/科特迪瓦/IMPORT/KT_IMPORT_202501.csv')\n",
    "# df = pd.read_csv('dataset/data_20250724/20251t3_csv_correct3/印度/IMPORT/INDIA_IMPORT_202501_2105A.csv')\n",
    "# df = pd.read_csv('dataset/data_20250724/20251t3_csv_correct3/印度/EXPORT/INDIA_EXPORT_202501_2106.csv')\n",
    "\n",
    "# df.iloc[:, 0:20]"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "51d9eaf7-cc19-445d-b8c3-30cf67837953",
   "metadata": {},
   "source": [
    "# 8 特殊处理-坦桑尼亚/乌克兰/智利"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "563f7102-f356-449b-b584-612237ec9253",
   "metadata": {},
   "source": [
    "## 8.2 '智利', '坦桑尼亚', '乌克兰' transportterm统一修正"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 43,
   "id": "a84bea2f-8e0a-49b9-9875-dd51c6e74d51",
   "metadata": {},
   "outputs": [],
   "source": [
    "# transtype_mapping_dict = {\n",
    "#     'AEREO': 'AIR',  # 西班牙语\"航空\"，匹配 AIR（航空运输）\n",
    "#     'AIR TRANSPORT': 'AIR',  # 英语\"航空运输\"，直接匹配 AIR\n",
    "#     'CARRETERO': 'ROAD',  # 西班牙语\"公路\"，匹配 ROAD（公路运输）\n",
    "#     'INLAND WATERWAYS TRANSPORT': 'INLAND WATER',  # 英语\"内河运输\"，直接匹配 INLAND WATER（内河运输）\n",
    "#     'MARITIMA': 'SEA',  # 西班牙语\"海洋\"，匹配 SEA（海洋运输）\n",
    "#     'MULTIMODAL TRANSPORT': 'MULTI-MODAL',  # 英语\"多式联运\"，直接匹配 MULTI-MODAL（多式联运）\n",
    "#     'OLEODUCTOS': 'PIPELINE',  # 西班牙语\"管道\"，匹配 PIPELINE（管道运输）\n",
    "#     'OTRO': 'OTHERS',  # 西班牙语\"其他\"，匹配 OTHERS（其他）\n",
    "#     'OTROS': 'OTHERS',  # 西班牙语\"其他\"（复数），匹配 OTHERS\n",
    "#     'POSTAL TRANSPORT': 'MAIL',  # 英语\"邮政运输\"，直接匹配 MAIL（邮政运输）\n",
    "#     'RAIL TRANSPORT': 'RAILWAY',  # 英语\"铁路运输\"，直接匹配 RAILWAY（铁路运输）\n",
    "#     'ROAD TRANSPORT': 'ROAD',  # 英语\"公路运输\"，直接匹配 ROAD\n",
    "#     'SEA TRANSPORT': 'SEA',  # 英语\"海洋运输\"，直接匹配 SEA\n",
    "#     'TENDIDO ELECTRICO (AEREO,SUBT)': 'OTHERS',  # 西班牙语\"电力线路（空中、地下）\"，不属于标准货物运输方式，归入 OTHERS\n",
    "#     'TRANSPORT ON FIXED INSTALLATION': 'PIPELINE',  # 英语\"固定设施上的运输\"，部分匹配 PIPELINE（管道是固定设施的一种）\n",
    "#     'UNKNOWN': 'OTHERS',  # 英语\"未知\"，匹配 OTHERS\n",
    "#     'АВТОТРАНСПОРТ СВОЇМ ХОДОМ ЯК ТОВАР': 'ROAD',  # 乌克兰语\"作为货物运输的汽车（用自己的动力）\"，匹配 ROAD（公路运输）\n",
    "#     'ВАНТАЖНИЙ АВТОМОБІЛЬ': 'ROAD',  # 乌克兰语\"卡车\"，匹配 ROAD\n",
    "#     'ВАНТАЖНИЙ АВТОМОБІЛЬ НА МОРСЬКОМУ СУДНІ': 'SEA',  # 乌克兰语\"在船上的卡车\"，运输方式为海运，匹配 SEA\n",
    "#     'ЗАЛІЗНИЧНИЙ ВАГОН': 'RAILWAY',  # 乌克兰语\"铁路车厢\"，匹配 RAILWAY\n",
    "#     'КОНТЕЙНЕР НА ВАНТАЖНОМУ АВТОМОБІЛІ': 'ROAD',  # 乌克兰语\"在卡车上的集装箱\"，运输方式为公路，匹配 ROAD\n",
    "#     'КОНТЕЙНЕР НА ЗАЛІЗНИЧНОМУ ВАГОНІ': 'RAILWAY',  # 乌克兰语\"在铁路车厢上的集装箱\"，匹配 RAILWAY\n",
    "#     'КОНТЕЙНЕР НА МОРСЬКОМУ СУДНІ': 'SEA',  # 乌克兰语\"在船上的集装箱\"，匹配 SEA\n",
    "#     'МОРСЬКЕ СУДНО': 'SEA',  # 乌克兰语\"海船\"，匹配 SEA\n",
    "#     'НЕВІДОМИЙ': 'OTHERS',  # 乌克兰语\"未知\"，匹配 OTHERS\n",
    "#     'ПОШТОВЕ ВІДПРАВЛЕННЯ АВТОТРАНСПОРТОМ': 'MAIL',  # 乌克兰语\"汽车运输的邮政发送\"，运输类型为邮政，匹配 MAIL\n",
    "#     'ТРУБОПРОВІДНИЙ ТРАНСПОРТ': 'PIPELINE'  # 乌克兰语\"管道运输\"，直接匹配 PIPELINE\n",
    "# }"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 44,
   "id": "fd9082d5-8efe-42c7-b303-4cb0b93dda75",
   "metadata": {},
   "outputs": [],
   "source": [
    "transtype_mapping_dict = {\n",
    "    \"VESSEL, CONTAINERIZED\": \"SEA\",\n",
    "    \"L\": \"LAND\",\n",
    "    \"NOT DECLARED\": \"OTHERS\",\n",
    "    \"PIPELINE\": \"PIPELINE\",\n",
    "    \"AERIAL\": \"AIR\",\n",
    "    \"AIR\": \"AIR\",\n",
    "    \"LAND - OTHER\": \"LAND\",\n",
    "    \"OLEODUCTOS\": \"PIPELINE\",\n",
    "    \"АВТОТРАНСПОРТ СВОЇМ ХОДОМ ЯК ТОВАР\": \"ROAD\",\n",
    "    \"ВАНТАЖНИЙ АВТОМОБІЛЬ\": \"ROAD\",\n",
    "    \"MARITIMO\": \"SEA\",\n",
    "    \"RAI\": \"RAILWAY\",\n",
    "    \"ЗАЛІЗНИЧНИЙ ВАГОН\": \"RAILWAY\",\n",
    "    \"MARITIMA\": \"SEA\",\n",
    "    \"LCS-SEA\": \"SEA\",\n",
    "    \"VESSEL, NON-CONTAINER\": \"SEA\",\n",
    "    \"TRANSPORT PAR AIR\": \"AIR\",\n",
    "    \"OTRO\": \"OTHERS\",\n",
    "    \"ROAD\": \"ROAD\",\n",
    "    \"ACUÁTICA\": \"INLAND WATER\",\n",
    "    \"MARITIME TRANSPORT\": \"SEA\",\n",
    "    \"nan\": \"OTHERS\",\n",
    "    \"AÉREA\": \"AIR\",\n",
    "    \"OTHER\": \"OTHERS\",\n",
    "    \"RAIL\": \"RAILWAY\",\n",
    "    \"COURIER (ADUANA AÉREA)\": \"AIR\",\n",
    "    \"R\": \"RAILWAY\",\n",
    "    \"VESSEL\": \"SEA\",\n",
    "    \"FERROVIARIA\": \"RAILWAY\",\n",
    "    \"ВАНТАЖНИЙ АВТОМОБІЛЬ НА МОРСЬКОМУ СУДНІ\": \"SEA\",\n",
    "    \"TENDIDO ELECTRICO (AEREO,SUBT)\": \"OTHERS\",\n",
    "    \"BY SEA\": \"SEA\",\n",
    "    \"MARÃ\\xadTIMO\": \"SEA\",\n",
    "    \"MARÍTIMO\": \"SEA\",\n",
    "    \"ПОШТОВЕ ВІДПРАВЛЕННЯ АВТОТРАНСПОРТОМ\": \"MAIL\",\n",
    "    \"ENCOMIENDA\": \"MAIL\",\n",
    "    \"INSTAL FIJAS\": \"OTHERS\",\n",
    "    \"AEREA\": \"AIR\",\n",
    "    \"LCS-ROAD\": \"ROAD\",\n",
    "    \"CONDUCTOR ELÉCTRICO\": \"OTHERS\",\n",
    "    \"AEREO\": \"AIR\",\n",
    "    \"SEA\": \"SEA\",\n",
    "    \"FLUVIAL\": \"INLAND WATER\",\n",
    "    \"MARITIME\": \"SEA\",\n",
    "    \"OWN MEANS\": \"OTHERS\",\n",
    "    \"O\": \"OTHERS\",\n",
    "    \"ТРУБОПРОВІДНИЙ ТРАНСПОРТ\": \"PIPELINE\",\n",
    "    \"ICD\": \"OTHERS\",\n",
    "    \"КОНТЕЙНЕР НА ВАНТАЖНОМУ АВТОМОБІЛІ\": \"ROAD\",\n",
    "    \"LAND - RAILWAY\": \"RAILWAY\",\n",
    "    \"TUBERIAS\": \"PIPELINE\",\n",
    "    \"TERRESTRIAL\": \"LAND\",\n",
    "    \"A\": \"AIR\",\n",
    "    \"OTROS\": \"OTHERS\",\n",
    "    \"S\": \"SEA\",\n",
    "    \"AIR TRANSPORT\": \"AIR\",\n",
    "    \"AÉREO\": \"AIR\",\n",
    "    \"ROAD TRANSPORT\": \"ROAD\",\n",
    "    \"AERIEN\": \"AIR\",\n",
    "    \"CORREO\": \"MAIL\",\n",
    "    \"AÃ©REA\": \"AIR\",\n",
    "    \"CONDUCTOR ELÃ©CTRICO\": \"OTHERS\",\n",
    "    \"TRANSPORT PAR ROUTE\": \"ROAD\",\n",
    "    \"RAIL TRANSPORT\": \"RAILWAY\",\n",
    "    \"LCS\": \"OTHERS\",\n",
    "    \"ICD-SEA\": \"SEA\",\n",
    "    \"ACUÃ¡TICA\": \"INLAND WATER\",\n",
    "    \"AÃ©REO\": \"AIR\",\n",
    "    \"МОРСЬКЕ СУДНО\": \"SEA\",\n",
    "    \"ICD-ROAD\": \"ROAD\",\n",
    "    \"TRUCK\": \"ROAD\",\n",
    "    \"TRANSPORT MARITIME\": \"SEA\",\n",
    "    \"DUCTOS\": \"PIPELINE\",\n",
    "    \"КОНТЕЙНЕР НА ЗАЛІЗНИЧНОМУ ВАГОНІ\": \"RAILWAY\",\n",
    "    \"POSTAL\": \"MAIL\",\n",
    "    \"TERRESTRE\": \"LAND\",\n",
    "    \"OTRAS VIAS\": \"OTHERS\",\n",
    "    \"CARRETERA\": \"ROAD\",\n",
    "    \"CARRETERO\": \"ROAD\",\n",
    "    \"НЕВІДОМИЙ\": \"OTHERS\",\n",
    "    \"ARREO\": \"OTHERS\",\n",
    "    \"КОНТЕЙНЕР НА МОРСЬКОМУ СУДНІ\": \"SEA\",\n",
    "    \"OTHERS\": \"OTHERS\",\n",
    "    \"UNKNOWN\": \"OTHERS\", # 未知运输方式，归类为其他\n",
    "    \"TRANSPORT ON FIXED INSTALLATION\": \"OTHERS\", # 固定设施运输，无直接对应标准类型，归类为其他\n",
    "    \"INLAND WATERWAYS TRANSPORT\": \"INLAND WATER\", # 内河运输，直接匹配 INLAND WATER（内河运输）\n",
    "    \"SEA TRANSPORT\": \"SEA\", # 海洋运输，直接匹配 SEA（海洋运输）\n",
    "    \"MULTIMODAL TRANSPORT\": \"MULTI-MODAL\", # 多式联运，直接匹配 MULTI-MODAL（多式联运）\n",
    "    \"POSTAL TRANSPORT\": \"MAIL\", # 邮政运输，直接匹配 MAIL（邮政运输）\n",
    "}"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 45,
   "id": "d0f0994d-c2e1-4304-b618-083a58efa2e3",
   "metadata": {},
   "outputs": [],
   "source": [
    "from_dir=Path('dataset/data_20250724/20251t3_csv_correct3')\n",
    "to_dir=Path('dataset/data_20250724/20251t3_csv_correct3')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 46,
   "id": "fce9a582-a16b-4247-9291-67e23a25ac9e",
   "metadata": {},
   "outputs": [],
   "source": [
    "country_list = ['智利', '坦桑尼亚', '乌克兰']\n",
    "# country_list = ['智利']"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 47,
   "id": "f16a527c-340e-4644-a4b7-ed9992780e72",
   "metadata": {},
   "outputs": [],
   "source": [
    "# transtype_list = get_transtype_list(country_list, from_dir, transtype_cols = ['transportterm'])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 48,
   "id": "739baf5c-4763-402c-b052-2b679abc7da5",
   "metadata": {},
   "outputs": [],
   "source": [
    "# replace_transtype(country_list, transtype_mapping_dict, from_dir, to_dir)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 49,
   "id": "0a8f633a-8bb6-4473-9c62-d59120941b7d",
   "metadata": {
    "scrolled": true
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "2025-09-10 02:35:08,959 - File Process - INFO - the path : dataset/data_20250724/20251t3_csv_correct3/智利/EXPORT/202502-EXP.csv\n",
      "2025-09-10 02:35:08,959 - File Process - INFO - the path : dataset/data_20250724/20251t3_csv_correct3/乌克兰/EXPORT/UA_EXPORT_202503.csv\n",
      "2025-09-10 02:35:08,960 - File Process - INFO - the path : dataset/data_20250724/20251t3_csv_correct3/坦桑尼亚/EXPORT/202501-EXP-RAW.csv\n",
      "2025-09-10 02:35:09,328 - File Process - INFO - save path: dataset/data_20250724/20251t3_csv_correct3/坦桑尼亚/EXPORT/202501-EXP-RAW.csv\n",
      "2025-09-10 02:35:09,328 - File Process - INFO - transtype mapping done, the data shape is (9990, 45)\n",
      "2025-09-10 02:35:09,329 - File Process - INFO - the path : dataset/data_20250724/20251t3_csv_correct3/坦桑尼亚/EXPORT/202503-EXP-RAW.csv\n",
      "2025-09-10 02:35:09,881 - File Process - INFO - save path: dataset/data_20250724/20251t3_csv_correct3/坦桑尼亚/EXPORT/202503-EXP-RAW.csv\n",
      "2025-09-10 02:35:09,883 - File Process - INFO - transtype mapping done, the data shape is (7761, 45)\n",
      "2025-09-10 02:35:09,883 - File Process - INFO - the path : dataset/data_20250724/20251t3_csv_correct3/坦桑尼亚/EXPORT/202503-TRANS-RAW.csv\n",
      "2025-09-10 02:35:12,718 - File Process - INFO - save path: dataset/data_20250724/20251t3_csv_correct3/智利/EXPORT/202502-EXP.csv\n",
      "2025-09-10 02:35:12,720 - File Process - INFO - transtype mapping done, the data shape is (98419, 45)\n",
      "2025-09-10 02:35:12,725 - File Process - INFO - the path : dataset/data_20250724/20251t3_csv_correct3/智利/EXPORT/202501-EXP.csv\n",
      "2025-09-10 02:35:14,993 - File Process - INFO - save path: dataset/data_20250724/20251t3_csv_correct3/坦桑尼亚/EXPORT/202503-TRANS-RAW.csv\n",
      "2025-09-10 02:35:14,999 - File Process - INFO - transtype mapping done, the data shape is (133800, 45)\n",
      "2025-09-10 02:35:15,000 - File Process - INFO - the path : dataset/data_20250724/20251t3_csv_correct3/坦桑尼亚/EXPORT/202502-TRANS-RAW.csv\n",
      "2025-09-10 02:35:16,431 - File Process - INFO - save path: dataset/data_20250724/20251t3_csv_correct3/智利/EXPORT/202501-EXP.csv\n",
      "2025-09-10 02:35:16,434 - File Process - INFO - transtype mapping done, the data shape is (127496, 45)\n",
      "2025-09-10 02:35:16,439 - File Process - INFO - the path : dataset/data_20250724/20251t3_csv_correct3/智利/EXPORT/202503-EXP.csv\n",
      "2025-09-10 02:35:17,692 - File Process - INFO - save path: dataset/data_20250724/20251t3_csv_correct3/乌克兰/EXPORT/UA_EXPORT_202503.csv\n",
      "2025-09-10 02:35:17,693 - File Process - INFO - transtype mapping done, the data shape is (141342, 45)\n",
      "2025-09-10 02:35:17,694 - File Process - INFO - the path : dataset/data_20250724/20251t3_csv_correct3/乌克兰/EXPORT/UA_EXPORT_202501.csv\n",
      "2025-09-10 02:35:19,841 - File Process - INFO - save path: dataset/data_20250724/20251t3_csv_correct3/坦桑尼亚/EXPORT/202502-TRANS-RAW.csv\n",
      "2025-09-10 02:35:19,861 - File Process - INFO - transtype mapping done, the data shape is (129629, 45)\n",
      "2025-09-10 02:35:19,862 - File Process - INFO - the path : dataset/data_20250724/20251t3_csv_correct3/坦桑尼亚/EXPORT/202501-TRANS-RAW.csv\n",
      "2025-09-10 02:35:20,016 - File Process - INFO - save path: dataset/data_20250724/20251t3_csv_correct3/智利/EXPORT/202503-EXP.csv\n",
      "2025-09-10 02:35:20,021 - File Process - INFO - transtype mapping done, the data shape is (100956, 45)\n",
      "2025-09-10 02:35:20,022 - File Process - INFO - the path : dataset/data_20250724/20251t3_csv_correct3/智利/IMPORT/202501-IMP.csv\n",
      "2025-09-10 02:35:25,563 - File Process - INFO - save path: dataset/data_20250724/20251t3_csv_correct3/乌克兰/EXPORT/UA_EXPORT_202501.csv\n",
      "2025-09-10 02:35:25,564 - File Process - INFO - transtype mapping done, the data shape is (115577, 45)\n",
      "2025-09-10 02:35:25,566 - File Process - INFO - the path : dataset/data_20250724/20251t3_csv_correct3/乌克兰/EXPORT/UA_EXPORT_202502.csv\n",
      "2025-09-10 02:35:25,746 - File Process - INFO - save path: dataset/data_20250724/20251t3_csv_correct3/坦桑尼亚/EXPORT/202501-TRANS-RAW.csv\n",
      "2025-09-10 02:35:25,751 - File Process - INFO - transtype mapping done, the data shape is (152034, 45)\n",
      "2025-09-10 02:35:25,754 - File Process - INFO - the path : dataset/data_20250724/20251t3_csv_correct3/坦桑尼亚/EXPORT/202502-EXP-RAW.csv\n",
      "2025-09-10 02:35:26,054 - File Process - INFO - save path: dataset/data_20250724/20251t3_csv_correct3/坦桑尼亚/EXPORT/202502-EXP-RAW.csv\n",
      "2025-09-10 02:35:26,056 - File Process - INFO - transtype mapping done, the data shape is (7660, 45)\n",
      "2025-09-10 02:35:26,092 - File Process - INFO - the path : dataset/data_20250724/20251t3_csv_correct3/坦桑尼亚/IMPORT/202501-IMP-RAW.csv\n",
      "2025-09-10 02:35:33,544 - File Process - INFO - save path: dataset/data_20250724/20251t3_csv_correct3/智利/IMPORT/202501-IMP.csv\n",
      "2025-09-10 02:35:33,545 - File Process - INFO - transtype mapping done, the data shape is (383012, 45)\n",
      "2025-09-10 02:35:33,546 - File Process - INFO - the path : dataset/data_20250724/20251t3_csv_correct3/智利/IMPORT/202502-IMP.csv\n",
      "2025-09-10 02:35:33,667 - File Process - INFO - save path: dataset/data_20250724/20251t3_csv_correct3/乌克兰/EXPORT/UA_EXPORT_202502.csv\n",
      "2025-09-10 02:35:33,668 - File Process - INFO - transtype mapping done, the data shape is (130121, 45)\n",
      "2025-09-10 02:35:33,669 - File Process - INFO - the path : dataset/data_20250724/20251t3_csv_correct3/乌克兰/IMPORT/UA_IMPORT_202501.csv\n",
      "2025-09-10 02:35:38,107 - File Process - INFO - save path: dataset/data_20250724/20251t3_csv_correct3/坦桑尼亚/IMPORT/202501-IMP-RAW.csv\n",
      "2025-09-10 02:35:38,118 - File Process - INFO - transtype mapping done, the data shape is (432559, 45)\n",
      "2025-09-10 02:35:38,126 - File Process - INFO - the path : dataset/data_20250724/20251t3_csv_correct3/坦桑尼亚/IMPORT/202503-IMP-RAW.csv\n",
      "2025-09-10 02:35:48,635 - File Process - INFO - save path: dataset/data_20250724/20251t3_csv_correct3/智利/IMPORT/202502-IMP.csv\n",
      "2025-09-10 02:35:48,635 - File Process - INFO - transtype mapping done, the data shape is (354974, 45)\n",
      "2025-09-10 02:35:48,636 - File Process - INFO - the path : dataset/data_20250724/20251t3_csv_correct3/智利/IMPORT/202503-IMP.csv\n",
      "2025-09-10 02:35:50,107 - File Process - INFO - save path: dataset/data_20250724/20251t3_csv_correct3/坦桑尼亚/IMPORT/202503-IMP-RAW.csv\n",
      "2025-09-10 02:35:50,119 - File Process - INFO - transtype mapping done, the data shape is (345735, 45)\n",
      "2025-09-10 02:35:50,121 - File Process - INFO - the path : dataset/data_20250724/20251t3_csv_correct3/坦桑尼亚/IMPORT/202502-IMP-RAW.csv\n",
      "2025-09-10 02:36:01,024 - File Process - INFO - save path: dataset/data_20250724/20251t3_csv_correct3/坦桑尼亚/IMPORT/202502-IMP-RAW.csv\n",
      "2025-09-10 02:36:01,025 - File Process - INFO - transtype mapping done, the data shape is (359832, 45)\n",
      "2025-09-10 02:36:01,025 - File Process - INFO - transtype mapping takes 52 seconds\n",
      "2025-09-10 02:36:01,063 - Data Goverance - INFO - 1\n",
      "2025-09-10 02:36:01,273 - File Process - INFO - save path: dataset/data_20250724/20251t3_csv_correct3/智利/IMPORT/202503-IMP.csv\n",
      "2025-09-10 02:36:01,273 - File Process - INFO - transtype mapping done, the data shape is (385873, 45)\n",
      "2025-09-10 02:36:01,274 - File Process - INFO - transtype mapping takes 52 seconds\n",
      "2025-09-10 02:36:01,319 - Data Goverance - INFO - 1\n",
      "2025-09-10 02:36:07,213 - File Process - INFO - save path: dataset/data_20250724/20251t3_csv_correct3/乌克兰/IMPORT/UA_IMPORT_202501.csv\n",
      "2025-09-10 02:36:07,213 - File Process - INFO - transtype mapping done, the data shape is (652325, 45)\n",
      "2025-09-10 02:36:07,214 - File Process - INFO - the path : dataset/data_20250724/20251t3_csv_correct3/乌克兰/IMPORT/UA_IMPORT_202503.csv\n",
      "2025-09-10 02:36:21,844 - File Process - INFO - save path: dataset/data_20250724/20251t3_csv_correct3/乌克兰/IMPORT/UA_IMPORT_202503.csv\n",
      "2025-09-10 02:36:21,845 - File Process - INFO - transtype mapping done, the data shape is (771565, 45)\n",
      "2025-09-10 02:36:21,845 - File Process - INFO - the path : dataset/data_20250724/20251t3_csv_correct3/乌克兰/IMPORT/UA_IMPORT_202502.csv\n",
      "2025-09-10 02:36:34,786 - File Process - INFO - save path: dataset/data_20250724/20251t3_csv_correct3/乌克兰/IMPORT/UA_IMPORT_202502.csv\n",
      "2025-09-10 02:36:34,787 - File Process - INFO - transtype mapping done, the data shape is (675826, 45)\n",
      "2025-09-10 02:36:34,787 - File Process - INFO - transtype mapping takes 86 seconds\n",
      "2025-09-10 02:36:34,870 - Data Goverance - INFO - 1\n",
      "2025-09-10 02:36:34,871 - Data Goverance - INFO - replace_currency_to_usd takes 1.4318825642267863 minutes\n"
     ]
    }
   ],
   "source": [
    "start = time.time()\n",
    "\n",
    "params_list = [([ctry], transtype_mapping_dict, from_dir, to_dir) for ctry in country_list]\n",
    "with ThreadPoolExecutor(max_workers=10) as executor:\n",
    "    # 提交多个任务\n",
    "    futures = [\n",
    "        executor.submit(lambda p: replace_transtype(*p), params)\n",
    "        for params in params_list\n",
    "    ]\n",
    "    # 按完成顺序获取结果\n",
    "    for future in as_completed(futures):\n",
    "        logger.info(future.result())\n",
    "\n",
    "end = time.time()\n",
    "logger.info(f'replace_currency_to_usd takes {(end-start)/60} minutes')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "eabb8d19-cc91-4a65-a9de-53a97e27d2ec",
   "metadata": {
    "scrolled": true
   },
   "outputs": [],
   "source": []
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "c5b2d883-8ab2-47c3-9929-2363f93d2582",
   "metadata": {},
   "outputs": [],
   "source": []
  },
  {
   "cell_type": "markdown",
   "id": "67874d64-9663-4948-abc0-cbc3203cf2fb",
   "metadata": {},
   "source": [
    "# 9 货代公司"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 50,
   "id": "56a91635-f835-4f22-9ef7-05a6c3eff9bb",
   "metadata": {},
   "outputs": [],
   "source": [
    "df_hd = pd.read_csv('物流货代词典_20250820.csv', dtype='str')\n",
    "df_hd['物流货代'] = df_hd['物流货代'].str.upper()\n",
    "# df_hd\n",
    "\n",
    "hd_list = list(df_hd['物流货代'].values)\n",
    "\n",
    "from_dir = Path('dataset/data_20250724/20251t3_csv_correct3')\n",
    "to_dir = Path('dataset/data_20250724/20251t3_csv_correct3')\n",
    "\n",
    "country_list = ['智利', '坦桑尼亚', '乌克兰']\n",
    "# country_list = ['智利']"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 51,
   "id": "040bbf6b-86b3-4495-9ab9-645ed07e3f4e",
   "metadata": {
    "scrolled": true
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "2025-09-10 02:36:34,899 - File Process - INFO - dataset/data_20250724/20251t3_csv_correct3/智利/EXPORT/202502-EXP.csv\n",
      "2025-09-10 02:36:34,900 - File Process - INFO - dataset/data_20250724/20251t3_csv_correct3/坦桑尼亚/EXPORT/202501-EXP-RAW.csv\n",
      "2025-09-10 02:36:34,901 - File Process - INFO - dataset/data_20250724/20251t3_csv_correct3/乌克兰/EXPORT/UA_EXPORT_202503.csv\n",
      "2025-09-10 02:36:36,509 - File Process - INFO - saved to : dataset/data_20250724/20251t3_csv_correct3/坦桑尼亚/EXPORT/202501-EXP-RAW.csv\n",
      "2025-09-10 02:36:36,510 - File Process - INFO - dataset/data_20250724/20251t3_csv_correct3/坦桑尼亚/EXPORT/202503-EXP-RAW.csv\n",
      "2025-09-10 02:36:38,742 - File Process - INFO - saved to : dataset/data_20250724/20251t3_csv_correct3/坦桑尼亚/EXPORT/202503-EXP-RAW.csv\n",
      "2025-09-10 02:36:38,753 - File Process - INFO - dataset/data_20250724/20251t3_csv_correct3/坦桑尼亚/EXPORT/202503-TRANS-RAW.csv\n",
      "2025-09-10 02:36:58,321 - File Process - INFO - saved to : dataset/data_20250724/20251t3_csv_correct3/智利/EXPORT/202502-EXP.csv\n",
      "2025-09-10 02:36:58,337 - File Process - INFO - dataset/data_20250724/20251t3_csv_correct3/智利/EXPORT/202501-EXP.csv\n",
      "2025-09-10 02:37:11,357 - File Process - INFO - saved to : dataset/data_20250724/20251t3_csv_correct3/坦桑尼亚/EXPORT/202503-TRANS-RAW.csv\n",
      "2025-09-10 02:37:11,360 - File Process - INFO - dataset/data_20250724/20251t3_csv_correct3/坦桑尼亚/EXPORT/202502-TRANS-RAW.csv\n",
      "2025-09-10 02:37:16,835 - File Process - INFO - saved to : dataset/data_20250724/20251t3_csv_correct3/智利/EXPORT/202501-EXP.csv\n",
      "2025-09-10 02:37:16,856 - File Process - INFO - dataset/data_20250724/20251t3_csv_correct3/智利/EXPORT/202503-EXP.csv\n",
      "2025-09-10 02:37:19,355 - File Process - INFO - saved to : dataset/data_20250724/20251t3_csv_correct3/乌克兰/EXPORT/UA_EXPORT_202503.csv\n",
      "2025-09-10 02:37:19,387 - File Process - INFO - dataset/data_20250724/20251t3_csv_correct3/乌克兰/EXPORT/UA_EXPORT_202501.csv\n",
      "2025-09-10 02:37:36,831 - File Process - INFO - saved to : dataset/data_20250724/20251t3_csv_correct3/智利/EXPORT/202503-EXP.csv\n",
      "2025-09-10 02:37:36,832 - File Process - INFO - dataset/data_20250724/20251t3_csv_correct3/智利/IMPORT/202501-IMP.csv\n",
      "2025-09-10 02:37:37,091 - File Process - INFO - saved to : dataset/data_20250724/20251t3_csv_correct3/坦桑尼亚/EXPORT/202502-TRANS-RAW.csv\n",
      "2025-09-10 02:37:37,093 - File Process - INFO - dataset/data_20250724/20251t3_csv_correct3/坦桑尼亚/EXPORT/202501-TRANS-RAW.csv\n",
      "2025-09-10 02:38:00,430 - File Process - INFO - saved to : dataset/data_20250724/20251t3_csv_correct3/乌克兰/EXPORT/UA_EXPORT_202501.csv\n",
      "2025-09-10 02:38:00,461 - File Process - INFO - dataset/data_20250724/20251t3_csv_correct3/乌克兰/EXPORT/UA_EXPORT_202502.csv\n",
      "2025-09-10 02:38:15,479 - File Process - INFO - saved to : dataset/data_20250724/20251t3_csv_correct3/坦桑尼亚/EXPORT/202501-TRANS-RAW.csv\n",
      "2025-09-10 02:38:15,506 - File Process - INFO - dataset/data_20250724/20251t3_csv_correct3/坦桑尼亚/EXPORT/202502-EXP-RAW.csv\n",
      "2025-09-10 02:38:19,431 - File Process - INFO - saved to : dataset/data_20250724/20251t3_csv_correct3/坦桑尼亚/EXPORT/202502-EXP-RAW.csv\n",
      "2025-09-10 02:38:19,434 - File Process - INFO - dataset/data_20250724/20251t3_csv_correct3/坦桑尼亚/IMPORT/202501-IMP-RAW.csv\n",
      "2025-09-10 02:38:43,450 - File Process - INFO - saved to : dataset/data_20250724/20251t3_csv_correct3/乌克兰/EXPORT/UA_EXPORT_202502.csv\n",
      "2025-09-10 02:38:43,478 - File Process - INFO - dataset/data_20250724/20251t3_csv_correct3/乌克兰/IMPORT/UA_IMPORT_202501.csv\n",
      "2025-09-10 02:38:51,234 - File Process - INFO - saved to : dataset/data_20250724/20251t3_csv_correct3/智利/IMPORT/202501-IMP.csv\n",
      "2025-09-10 02:38:51,254 - File Process - INFO - dataset/data_20250724/20251t3_csv_correct3/智利/IMPORT/202502-IMP.csv\n",
      "2025-09-10 02:39:21,137 - File Process - INFO - saved to : dataset/data_20250724/20251t3_csv_correct3/坦桑尼亚/IMPORT/202501-IMP-RAW.csv\n",
      "2025-09-10 02:39:21,262 - File Process - INFO - dataset/data_20250724/20251t3_csv_correct3/坦桑尼亚/IMPORT/202503-IMP-RAW.csv\n",
      "2025-09-10 02:39:55,868 - File Process - INFO - saved to : dataset/data_20250724/20251t3_csv_correct3/智利/IMPORT/202502-IMP.csv\n",
      "2025-09-10 02:39:55,986 - File Process - INFO - dataset/data_20250724/20251t3_csv_correct3/智利/IMPORT/202503-IMP.csv\n",
      "2025-09-10 02:40:47,718 - File Process - INFO - saved to : dataset/data_20250724/20251t3_csv_correct3/坦桑尼亚/IMPORT/202503-IMP-RAW.csv\n",
      "2025-09-10 02:40:47,720 - File Process - INFO - dataset/data_20250724/20251t3_csv_correct3/坦桑尼亚/IMPORT/202502-IMP-RAW.csv\n",
      "2025-09-10 02:41:08,662 - File Process - INFO - saved to : dataset/data_20250724/20251t3_csv_correct3/智利/IMPORT/202503-IMP.csv\n",
      "2025-09-10 02:41:08,668 - File Process - INFO - country list ['智利'] takes 274 seconds\n",
      "2025-09-10 02:41:08,737 - Data Goverance - INFO - 1\n",
      "2025-09-10 02:41:20,356 - File Process - INFO - saved to : dataset/data_20250724/20251t3_csv_correct3/坦桑尼亚/IMPORT/202502-IMP-RAW.csv\n",
      "2025-09-10 02:41:20,357 - File Process - INFO - country list ['坦桑尼亚'] takes 285 seconds\n",
      "2025-09-10 02:41:20,392 - Data Goverance - INFO - 1\n",
      "2025-09-10 02:41:26,130 - File Process - INFO - saved to : dataset/data_20250724/20251t3_csv_correct3/乌克兰/IMPORT/UA_IMPORT_202501.csv\n",
      "2025-09-10 02:41:26,132 - File Process - INFO - dataset/data_20250724/20251t3_csv_correct3/乌克兰/IMPORT/UA_IMPORT_202503.csv\n",
      "2025-09-10 02:42:19,762 - File Process - INFO - saved to : dataset/data_20250724/20251t3_csv_correct3/乌克兰/IMPORT/UA_IMPORT_202503.csv\n",
      "2025-09-10 02:42:19,763 - File Process - INFO - dataset/data_20250724/20251t3_csv_correct3/乌克兰/IMPORT/UA_IMPORT_202502.csv\n",
      "2025-09-10 02:43:07,012 - File Process - INFO - saved to : dataset/data_20250724/20251t3_csv_correct3/乌克兰/IMPORT/UA_IMPORT_202502.csv\n",
      "2025-09-10 02:43:07,014 - File Process - INFO - country list ['乌克兰'] takes 392 seconds\n",
      "2025-09-10 02:43:07,126 - Data Goverance - INFO - 1\n",
      "2025-09-10 02:43:07,127 - Data Goverance - INFO - all comp_forwarderagent takes 6.537162788709005 minutes\n"
     ]
    }
   ],
   "source": [
    "start = time.time()\n",
    "params_list = [([ctry], hd_list, from_dir, to_dir, 0.95) for ctry in country_list]\n",
    "\n",
    "with ThreadPoolExecutor(max_workers=10) as executor:\n",
    "    # 提交多个任务\n",
    "    futures = [\n",
    "        executor.submit(lambda p: comp_forwarderagent(*p), params)\n",
    "        for params in params_list\n",
    "    ]\n",
    "    # 按完成顺序获取结果\n",
    "    for future in as_completed(futures):\n",
    "        logger.info(future.result())\n",
    "        # pass\n",
    "\n",
    "end = time.time()\n",
    "logger.info(f'all comp_forwarderagent takes {(end-start)/60} minutes')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "68e10c8f-9775-44f1-87c1-2d3163555932",
   "metadata": {},
   "outputs": [],
   "source": []
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "id": "a2c0b3d8-85b4-4df1-b6ae-8805cb5e7d88",
   "metadata": {
    "scrolled": true
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "\"channelno\",\"dataid\",\"iesign\",\"datatype\",\"writeoffflag\",\"writeoffdataid\",\"outputdate\",\"origincountrycode\",\"origincountry\",\"countrycodeofdelivery\",\"countryofdelivery\",\"importername\",\"importeraddress\",\"importercontact\",\"suppliername\",\"supplieraddress\",\"suppliercontact\",\"hscode\",\"hscodedescription\",\"commoditydescription\",\"totalcifvalue\",\"totalfobvalue\",\"grossweight\",\"netweight\",\"quantity\",\"quantityunit\",\"teu\",\"importer_forwarderagent\",\"supplier_forwarderagent\",\"abnormaldata\",\"portofloading\",\"portofdestination\",\"loadingcountrycode\",\"loadingcountry\",\"transportterm\",\"tradeterm\",\"paymentterm\",\"carrier\",\"containerno\",\"vesselname\",\"brand\",\"version\",\"country\",\"IMPORTER_ID\",\"SUPPLIER_ID\"\n",
      "\"zdzj\",\"57cec91d50424a1c\",\"E\",\"D\",\"O\",\"\",\"2025-01-03\",\"CHL\",\"CHILE\",\"PER\",\"PERU\",\"\",\"\",\"\",\"\",\"\",\"\",\"84314190\",\"LOS DEMAS CANGILONES PARA MAQ. O APARATOS DE LAS PTDAS. 84.26, 84.29 U 84.\",\"DIFERENCIALESDE MAQUINARIACATERPILLARUSADAS\",\"0.0\",\"100.0\",\"1050.0\",\"\",\"2.0\",\"KILOGRAMOS NETOS\",\"\",\"0\",\"0\",\"0\",\"CHACALLUTA\",\"OTROS PTOS. DE PERU\",\"CHL\",\"CHILE\",\"ROAD\",\"CYS\",\"\",\"COM.Y SERV.JOSE A.SAAL CH.\",\"\",\"\",\"\",\"202501\",\"CHL\",\"\",\"\"\n",
      "\"zdzj\",\"65369517c6854a39\",\"E\",\"D\",\"O\",\"\",\"2025-01-03\",\"CHL\",\"CHILE\",\"PER\",\"PERU\",\"\",\"\",\"\",\"\",\"\",\"\",\"84314190\",\"LOS DEMAS CANGILONES PARA MAQ. O APARATOS DE LAS PTDAS. 84.26, 84.29 U 84.\",\"TRANSMISIONDE MAQUINARIACATERPILLARSERIE NB1005291USADA\",\"0.0\",\"90.0\",\"1050.0\",\"\",\"1.0\",\"KILOGRAMOS NETOS\",\"\",\"0\",\"0\",\"0\",\"CHACALLUTA\",\"OTROS PTOS. DE PERU\",\"CHL\",\"CHILE\",\"ROAD\",\"CYS\",\"\",\"COM.Y SERV.JOSE A.SAAL CH.\",\"\",\"\",\"\",\"202501\",\"CHL\",\"\",\"\"\n",
      "\"zdzj\",\"96a14471225846b9\",\"E\",\"D\",\"O\",\"\",\"2025-01-06\",\"CHL\",\"CHILE\",\"PER\",\"PERU\",\"\",\"\",\"\",\"VULCO S.A.\",\"\",\"\",\"84139100\",\"PARTES DE BOMBAS PARA LIQUIDOS.\",\"ALMA TERMINADAVULCO-FPARA BOMBA CENTRIFUGADE USO EN LA MINERIASIN USO\",\"0.0\",\"7110.99\",\"11421.8\",\"\",\"331.0\",\"KILOGRAMOS NETOS\",\"\",\"0\",\"0\",\"0\",\"CHACALLUTA\",\"OTROS PTOS. DE PERU\",\"CHL\",\"CHILE\",\"ROAD\",\"EXW\",\"\",\"TRANSP. OCANA LTDA.\",\"\",\"\",\"\",\"202501\",\"CHL\",\"\",\"91619000\"\n",
      "\"zdzj\",\"409b29a68f6c4999\",\"E\",\"D\",\"O\",\"\",\"2025-01-06\",\"CHL\",\"CHILE\",\"PER\",\"PERU\",\"\",\"\",\"\",\"VULCO S.A.\",\"\",\"\",\"40012200\",\"CAUCHOS TECNICAMENTE ESPECIFICADOS (TSNR).\",\"BASE R55VULCO-FPARA DE BOMBA CENTRIFUGADEUSO EN PROCESOS MINEROSSIN  USO\",\"0.0\",\"25578.98\",\"11421.8\",\"\",\"7000.0\",\"KILOGRAMOS NETOS\",\"\",\"0\",\"0\",\"0\",\"CHACALLUTA\",\"OTROS PTOS. DE PERU\",\"CHL\",\"CHILE\",\"ROAD\",\"EXW\",\"\",\"TRANSP. OCANA LTDA.\",\"\",\"\",\"\",\"202501\",\"CHL\",\"\",\"91619000\"\n",
      "\"zdzj\",\"e515c143a2b246ec\",\"E\",\"D\",\"O\",\"\",\"2025-01-06\",\"CHL\",\"CHILE\",\"PER\",\"PERU\",\"\",\"\",\"\",\"VULCO S.A.\",\"\",\"\",\"84139100\",\"PARTES DE BOMBAS PARA LIQUIDOS.\",\"SPACER PLATEVULCO-FPARTE DE BOMBA CENTRIFUGADEUSO EN PROCESOS MINEROSSIN  USO\",\"0.0\",\"2769.07\",\"11421.8\",\"\",\"116.0\",\"KILOGRAMOS NETOS\",\"\",\"0\",\"0\",\"0\",\"CHACALLUTA\",\"OTROS PTOS. DE PERU\",\"CHL\",\"CHILE\",\"ROAD\",\"EXW\",\"\",\"TRANSP. OCANA LTDA.\",\"\",\"\",\"\",\"202501\",\"CHL\",\"\",\"91619000\"\n",
      "\"zdzj\",\"6aacd3eb344641e0\",\"E\",\"D\",\"O\",\"\",\"2025-01-06\",\"CHL\",\"CHILE\",\"PER\",\"PERU\",\"\",\"\",\"\",\"VULCO S.A.\",\"\",\"\",\"84139100\",\"PARTES DE BOMBAS PARA LIQUIDOS.\",\"ANILLO SELLOVULCO-FPARTE DE BOMBA CENTRIFUGADEUSO EN PROCESOS MINEROSSIN  USO\",\"0.0\",\"75.62\",\"11421.8\",\"\",\"1.8\",\"KILOGRAMOS NETOS\",\"\",\"0\",\"0\",\"0\",\"CHACALLUTA\",\"OTROS PTOS. DE PERU\",\"CHL\",\"CHILE\",\"ROAD\",\"EXW\",\"\",\"TRANSP. OCANA LTDA.\",\"\",\"\",\"\",\"202501\",\"CHL\",\"\",\"91619000\"\n",
      "\"zdzj\",\"e82738a3d6594338\",\"E\",\"D\",\"O\",\"\",\"2025-01-06\",\"CHL\",\"CHILE\",\"PER\",\"PERU\",\"\",\"\",\"\",\"VULCO S.A.\",\"\",\"\",\"84139100\",\"PARTES DE BOMBAS PARA LIQUIDOS.\",\"ANILLO ESPACIADORVULCO-FPARTE PARA BOMBA CENTRIFUGADE USO EN LA MINERIASINUSO\",\"0.0\",\"459.78\",\"11421.8\",\"\",\"3.39\",\"KILOGRAMOS NETOS\",\"\",\"0\",\"0\",\"0\",\"CHACALLUTA\",\"OTROS PTOS. DE PERU\",\"CHL\",\"CHILE\",\"ROAD\",\"EXW\",\"\",\"TRANSP. OCANA LTDA.\",\"\",\"\",\"\",\"202501\",\"CHL\",\"\",\"91619000\"\n",
      "\"zdzj\",\"a1d571f447d24754\",\"E\",\"D\",\"O\",\"\",\"2025-01-06\",\"CHL\",\"CHILE\",\"PER\",\"PERU\",\"\",\"\",\"\",\"VULCO S.A.\",\"\",\"\",\"84139100\",\"PARTES DE BOMBAS PARA LIQUIDOS.\",\"ANILLO DE FIJACIONVULCO-FPARTE PARA BOMBA CENTRIFUGADE USO EN LA MINERIASIN USO\",\"0.0\",\"331.24\",\"11421.8\",\"\",\"3.2\",\"KILOGRAMOS NETOS\",\"\",\"0\",\"0\",\"0\",\"CHACALLUTA\",\"OTROS PTOS. DE PERU\",\"CHL\",\"CHILE\",\"ROAD\",\"EXW\",\"\",\"TRANSP. OCANA LTDA.\",\"\",\"\",\"\",\"202501\",\"CHL\",\"\",\"91619000\"\n",
      "\"zdzj\",\"389b272ee7b74c60\",\"E\",\"D\",\"O\",\"\",\"2025-01-06\",\"CHL\",\"CHILE\",\"PER\",\"PERU\",\"\",\"\",\"\",\"VULCO S.A.\",\"\",\"\",\"73182900\",\"LOS DEMAS ARTICULOS SIN ROSCAR, DE FUNDICION, DE HIERRO O DE ACERO.\",\"GOLILLA SEGUROVULCO-FPARTE DE BOMBA CENTRIFUGADE USO EN PROCESOS MINEROSSIN USO\",\"0.0\",\"4.82\",\"11421.8\",\"\",\"0.1\",\"KILOGRAMOS NETOS\",\"\",\"0\",\"0\",\"0\",\"CHACALLUTA\",\"OTROS PTOS. DE PERU\",\"CHL\",\"CHILE\",\"ROAD\",\"EXW\",\"\",\"TRANSP. OCANA LTDA.\",\"\",\"\",\"\",\"202501\",\"CHL\",\"\",\"91619000\"\n"
     ]
    }
   ],
   "source": [
    "# !head -10 dataset/data_20250724/20251t3_csv_correct3/坦桑尼亚/IMPORT/202501-IMP-RAW.csv\n",
    "# !head -10 dataset/data_20250724/20251t3_csv_correct3/乌克兰/EXPORT/UA_EXPORT_202501.csv\n",
    "# !head -3 dataset/data_20250724/20251t3_csv_correct3/智利/IMPORT/202501-IMP.csv\n",
    "# !head -10 dataset/data_20250724/20251t3_csv_correct3/智利/EXPORT/202501-EXP.csv"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "435e18c5-ea20-4517-866d-6ded3b4789ef",
   "metadata": {},
   "outputs": [],
   "source": []
  },
  {
   "cell_type": "code",
   "execution_count": 53,
   "id": "32c6b5b4-ae62-479a-8607-b98027901a5e",
   "metadata": {},
   "outputs": [],
   "source": [
    "# pth2 = Path('dataset/data_20250724/20251t3_csv_correct3/坦桑尼亚/EXPORT/202503-TRANS-RAW.csv')\n",
    "# pth2 = Path('dataset/data_20250724/20251t3_csv_correct3/坦桑尼亚/EXPORT/202503-EXP-RAW.csv')\n",
    "# pth2 = Path('dataset/data_20250724/20251t3_csv_correct3/坦桑尼亚/IMPORT/202503-IMP-RAW.csv')\n",
    "# pth2 = Path('dataset/data_20250724/20251t3_csv_correct3/乌克兰/EXPORT/UA_EXPORT_202501.csv')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 54,
   "id": "693ae123-27a3-4277-91b0-3240f77749b4",
   "metadata": {},
   "outputs": [],
   "source": [
    "# df2 = pd.read_csv(pth2)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 55,
   "id": "1fb10f21-c1c6-42a3-aa50-3d80a79f6a67",
   "metadata": {},
   "outputs": [],
   "source": [
    "# set(list(df2['importer_forwarderagent'].values))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 56,
   "id": "0dd604ed-02e6-4014-b1d8-2c126ccf1fb1",
   "metadata": {},
   "outputs": [],
   "source": [
    "# set(list(df2['supplier_forwarderagent'].values))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 57,
   "id": "a7428728-27d1-42a5-a132-9a6ddbb39414",
   "metadata": {},
   "outputs": [],
   "source": [
    "# df2.loc[df2['importer_forwarderagent']!=0, ['dataid', 'importername', 'importer_forwarderagent']]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 58,
   "id": "c8c21f08-db9c-444d-9a36-8cd9e8093653",
   "metadata": {},
   "outputs": [],
   "source": [
    "# df2.loc[df2['supplier_forwarderagent']!=0, ['dataid', 'suppliername', 'supplier_forwarderagent']]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "3c00767f-7c36-46ca-b3ad-46f644d9c0da",
   "metadata": {},
   "outputs": [],
   "source": []
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "aa4f65e7-9855-4ed9-8cd1-3e25466c8d3b",
   "metadata": {},
   "outputs": [],
   "source": []
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "a99ea6ba-a9e5-438f-8a40-eae199fe67af",
   "metadata": {},
   "outputs": [],
   "source": []
  },
  {
   "cell_type": "markdown",
   "id": "3e5294ce-9f93-48ed-85f7-197a6c1ab582",
   "metadata": {},
   "source": [
    "# 9 空值率与指标统计，并更新数字"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "26f913de-8799-42ed-b6fb-178a6a5263ba",
   "metadata": {},
   "source": [
    "## 9.2 总和指标计算"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 1,
   "id": "cea3c8e7-839f-4afc-a348-e75775489517",
   "metadata": {},
   "outputs": [],
   "source": [
    "# country_list = ['智利', '坦桑尼亚', '乌克兰']\n",
    "# # country_list = ['智利', '坦桑尼亚']\n",
    "# # country_list = ['智利']\n",
    "# csv_files_path = Path('dataset/data_20250724/20251t3_csv_final_tmp/files10')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "id": "322d792b-eb24-47c3-99e0-ce9a24602ed1",
   "metadata": {
    "scrolled": true
   },
   "outputs": [],
   "source": [
    "# start = time.time()\n",
    "\n",
    "# params_list = [([ctry], csv_files_path, country36_dict) for ctry in country_list]\n",
    "# results = []\n",
    "# with ThreadPoolExecutor(max_workers=10) as executor:\n",
    "#     # 提交多个任务\n",
    "#     futures = [\n",
    "#         executor.submit(lambda p: compute_stat4(*p), params)\n",
    "#         for params in params_list\n",
    "#     ]\n",
    "#     # 按完成顺序获取结果\n",
    "#     for future in as_completed(futures):\n",
    "#         res = future.result()\n",
    "#         logger.info(res.shape)\n",
    "#         results.append(res)\n",
    "\n",
    "# stat_rslt1 = pd.concat([x for x in results], axis=0, ignore_index=True)\n",
    "\n",
    "# end = time.time()\n",
    "# logger.info(f'get_type_enumvalue takes {(end-start)/60} minutes')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "id": "13d2681a-6c8b-4c90-a2c2-af0d9e3f4344",
   "metadata": {
    "scrolled": true
   },
   "outputs": [],
   "source": [
    "# 单线程\n",
    "# stat_rslt1 = compute_stat4(country_list, csv_files_path, country36_dict)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "id": "69fe5b33-ff94-4ab8-89f7-e1f0d8eba209",
   "metadata": {},
   "outputs": [],
   "source": [
    "# stat_rslt1.to_csv('result/statistic_result_sum_20250907.csv', sep=',', index=False, header=True, encoding='utf-8')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "id": "9b9fb37b-188d-4d9e-b104-7fe6c7924830",
   "metadata": {
    "scrolled": true
   },
   "outputs": [],
   "source": [
    "# stat_rslt1"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "id": "761d9e88-ff0f-4c68-9256-e5dc0ba8db00",
   "metadata": {},
   "outputs": [],
   "source": [
    "# future.result().shape"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "1c699184-2580-4723-81b7-b380d2fb8460",
   "metadata": {},
   "source": [
    "## 9.3 空值率指标计算"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 7,
   "id": "15b35f4a-f893-43b7-82a2-4799092cf839",
   "metadata": {
    "scrolled": true
   },
   "outputs": [],
   "source": [
    "# country_list = ['智利', '坦桑尼亚', '乌克兰']\n",
    "# # country_list = ['智利', '坦桑尼亚']\n",
    "# # country_list = ['坦桑尼亚']\n",
    "# # country_list = ['智利']\n",
    "# # csv_files_path = Path('dataset/data_20250724/20251t3_csv_final/files')\n",
    "# # csv_files_path = Path('dataset/data_20250724/20251t3_csv_final_tmp/final_files')\n",
    "# csv_files_path = Path('dataset/data_20250724/20251t3_csv_final_tmp/files10')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 8,
   "id": "ad3ecb31-8190-49e5-9c60-865eda31a9c9",
   "metadata": {
    "scrolled": true
   },
   "outputs": [],
   "source": [
    "# stat_rslt2 = compute_stat5(country_list, csv_files_path, country36_dict)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 9,
   "id": "dee180d9-2862-4db4-b03f-1b129bee5632",
   "metadata": {
    "scrolled": true
   },
   "outputs": [],
   "source": [
    "# start = time.time()\n",
    "# params_list = [([ctry], csv_files_path, country36_dict) for ctry in country_list]\n",
    "\n",
    "# results = []\n",
    "# with ThreadPoolExecutor(max_workers=10) as executor:\n",
    "#     # 提交多个任务\n",
    "#     futures = [\n",
    "#         executor.submit(lambda p: compute_stat5(*p), params)\n",
    "#         for params in params_list\n",
    "#     ]\n",
    "#     # 按完成顺序获取结果\n",
    "#     for future in as_completed(futures):\n",
    "#         res = future.result()\n",
    "#         logger.info(res.shape)\n",
    "#         results.append(res)\n",
    "\n",
    "# stat_rslt2 = pd.concat([x for x in results], axis=0, ignore_index=True)\n",
    "# # stat_rslt2\n",
    "\n",
    "# end = time.time()\n",
    "# logger.info(f'get_type_enumvalue takes {(end-start)/60} minutes')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 10,
   "id": "3549e3f8-0421-4ff9-a1dd-0f78589e87be",
   "metadata": {},
   "outputs": [],
   "source": [
    "# stat_rslt2.to_csv('result/statistic_result_rus_20250907.csv', sep=',', index=False, header=True, encoding='utf-8')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 11,
   "id": "384e3d14-9211-41f7-a23c-1024c00e8792",
   "metadata": {
    "scrolled": true
   },
   "outputs": [],
   "source": [
    "# stat_rslt2"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "ca44b1d4-dd32-4216-8b92-59e093c9bdea",
   "metadata": {},
   "outputs": [],
   "source": []
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "52802231-a7f5-4877-9278-199939970fd9",
   "metadata": {},
   "outputs": [],
   "source": []
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "d342a52c-d4e7-491b-bbd7-82ebc1df4d32",
   "metadata": {},
   "outputs": [],
   "source": []
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "6d598c13-4a73-4ce4-8e15-a56aeca99cd9",
   "metadata": {},
   "outputs": [],
   "source": []
  },
  {
   "cell_type": "markdown",
   "id": "04b3395e-9c1c-4648-8b1b-fe38473c1ad8",
   "metadata": {},
   "source": [
    "# 10 获取HS码映射关系"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 12,
   "id": "cd501e74-e687-4031-9165-94b50edcd1f4",
   "metadata": {},
   "outputs": [],
   "source": [
    "# from_path = Path('dataset/data_20250724/20251t3_csv_correct1')\n",
    "# to_path = Path('result/20251t3_hs_mapping')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 13,
   "id": "52d02c0b-b4c9-4bc1-ae2a-0f807ad89cde",
   "metadata": {},
   "outputs": [],
   "source": [
    "# country_list = ['印度', '英国', '越南', '阿根廷', '埃塞俄比亚', '巴基斯坦', '俄罗斯陆运', '厄瓜多尔'\n",
    "#                 , '菲律宾', '哥伦比亚', '哥斯达黎加', '哈萨克斯坦', '加纳', '喀麦隆', '科特迪瓦', '肯尼亚'\n",
    "#                 , '莱索托', '马拉维', '美国', '孟加拉', '秘鲁', '秘鲁-海运', '秘鲁-空运', '墨西哥'\n",
    "#                 , '纳米比亚', '尼日利亚', '斯里兰卡', '坦桑尼亚', '乌干达', '乌克兰', '乌拉圭', '乌兹别克斯坦'\n",
    "#                 , '智利', '巴拉圭', '巴拿马', '博兹瓦纳']\n",
    "# # country_list = ['尼日利亚', '博兹瓦纳']\n",
    "# # country_list = ['阿根廷']"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 14,
   "id": "51a075b0-d2e9-4b79-a93b-e45fc41e9a52",
   "metadata": {
    "scrolled": true
   },
   "outputs": [],
   "source": [
    "# get_hs_mapping(from_path, to_path, country_list)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 15,
   "id": "d060e876-48fd-4612-9dca-3fdc2d228b07",
   "metadata": {},
   "outputs": [],
   "source": [
    "# start = time.time()\n",
    "# params_list = [(from_path, to_path, [ctry]) for ctry in country_list]\n",
    "\n",
    "# results = []\n",
    "# with ThreadPoolExecutor(max_workers=10) as executor:\n",
    "#     # 提交多个任务\n",
    "#     futures = [\n",
    "#         executor.submit(lambda p: get_hs_mapping(*p), params)\n",
    "#         for params in params_list\n",
    "#     ]\n",
    "#     # 按完成顺序获取结果\n",
    "#     for future in as_completed(futures):\n",
    "#         res = future.result()\n",
    "#         logger.info(res.shape)\n",
    "#         results.append(res)\n",
    "\n",
    "# end = time.time()\n",
    "# logger.info(f'get_hs_mapping takes {(end-start)/60} minutes')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 16,
   "id": "ab74dae3-3681-43a3-9e01-310981f9c1a5",
   "metadata": {},
   "outputs": [],
   "source": [
    "# results"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "6ab1d982-3dd8-4f50-bcb9-e8b4ef73027c",
   "metadata": {},
   "source": [
    "# 11 获取成交方式、运输方式、付款方式"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 17,
   "id": "be3db185-dbf5-4568-aa60-8b1127378e00",
   "metadata": {},
   "outputs": [],
   "source": [
    "# transportterm    运输方式\n",
    "# tradeterm    成交方式\n",
    "# paymentterm    付款方式"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 18,
   "id": "970730e1-92bc-40b1-9b4f-2d17552668af",
   "metadata": {},
   "outputs": [],
   "source": [
    "# from_path = Path('dataset/data_20250724/20251t3_csv_correct1')\n",
    "# to_path = Path('result')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 19,
   "id": "3e0843a1-855d-4358-bebe-4eb662be9a17",
   "metadata": {},
   "outputs": [],
   "source": [
    "# country_list = ['印度', '英国', '越南', '阿根廷', '埃塞俄比亚', '巴基斯坦', '俄罗斯陆运', '厄瓜多尔'\n",
    "#                 , '菲律宾', '哥伦比亚', '哥斯达黎加', '哈萨克斯坦', '加纳', '喀麦隆', '科特迪瓦', '肯尼亚'\n",
    "#                 , '莱索托', '马拉维', '美国', '孟加拉', '秘鲁', '秘鲁-海运', '秘鲁-空运', '墨西哥'\n",
    "#                 , '纳米比亚', '尼日利亚', '斯里兰卡', '坦桑尼亚', '乌干达', '乌克兰', '乌拉圭', '乌兹别克斯坦'\n",
    "#                 , '智利', '巴拉圭', '巴拿马', '博兹瓦纳']\n",
    "# # country_list = ['阿根廷']"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 20,
   "id": "ed8b9225-2af5-4a0a-826e-ae6495f086ac",
   "metadata": {
    "scrolled": true
   },
   "outputs": [],
   "source": [
    "# rslt = get_type_enumvalue(from_path, to_path, country_list)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 21,
   "id": "2847fd74-377e-4a6c-be00-14448f1ed937",
   "metadata": {},
   "outputs": [],
   "source": [
    "# start = time.time()\n",
    "# params_list = [(from_path, to_path, [ctry]) for ctry in country_list]\n",
    "\n",
    "# results = []\n",
    "# with ThreadPoolExecutor(max_workers=10) as executor:\n",
    "#     # 提交多个任务\n",
    "#     futures = [\n",
    "#         executor.submit(lambda p: get_type_enumvalue(*p), params)\n",
    "#         for params in params_list\n",
    "#     ]\n",
    "#     # 按完成顺序获取结果\n",
    "#     for future in as_completed(futures):\n",
    "#         res = future.result()\n",
    "#         logger.info(res.shape)\n",
    "#         results.append(res)\n",
    "\n",
    "# # rslt = pd.concat([future.result() for future in futures], axis=0, ignore_index=True)\n",
    "# end = time.time()\n",
    "# logger.info(f'get_type_enumvalue takes {(end-start)/60} minutes')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 22,
   "id": "7739976a-4002-4170-bee3-3f2050dc52d6",
   "metadata": {},
   "outputs": [],
   "source": [
    "# results"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 23,
   "id": "49bec0f3-f253-4585-9e3b-59e34ec76b8c",
   "metadata": {},
   "outputs": [],
   "source": [
    "# rslt.to_dict('records')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 24,
   "id": "5dac3146-6b0c-4137-af4c-bbbc46fa2702",
   "metadata": {
    "scrolled": true
   },
   "outputs": [],
   "source": [
    "# rslt"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "b78e1669-6305-4fd5-8368-f558bea09d78",
   "metadata": {},
   "source": [
    "# 12 上传的csv文件格式处理"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "6471fa36-2d86-4ae8-9dda-a14a686d71cd",
   "metadata": {},
   "source": [
    "## 12.1 去掉\"\\\\N\""
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "3a8ac44e-faec-40ba-aa71-1f8da8826088",
   "metadata": {},
   "outputs": [],
   "source": [
    "# country_list = ['印度', '英国', '越南', '阿根廷', '埃塞俄比亚', '巴基斯坦', '俄罗斯陆运', '厄瓜多尔'\n",
    "#                 , '菲律宾', '哥伦比亚', '哥斯达黎加', '哈萨克斯坦', '加纳', '喀麦隆', '科特迪瓦', '肯尼亚'\n",
    "#                 , '莱索托', '马拉维', '美国', '孟加拉', '秘鲁', '秘鲁-海运', '秘鲁-空运', '墨西哥'\n",
    "#                 , '纳米比亚', '尼日利亚', '斯里兰卡', '坦桑尼亚', '乌干达', '乌克兰', '乌拉圭', '乌兹别克斯坦'\n",
    "#                 , '智利', '巴拉圭', '巴拿马', '博兹瓦纳']"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "e79f6b8d-369c-4949-a1c3-f4f1c206e87e",
   "metadata": {},
   "outputs": [],
   "source": [
    "# # 获取文件目录，区分出口、进口数据\n",
    "# from_path = Path('dataset/data_20250724/20251t3_csv_final_tmp/final_files')\n",
    "\n",
    "# # file_tree = distinct_ie_data2(from_path, country_list)\n",
    "# to_path = Path('dataset/data_20250724/20251t3_csv_final/final_files')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "39872c1b-733e-4e75-9e5f-bff06b031204",
   "metadata": {},
   "outputs": [],
   "source": [
    "# country_list = ['俄罗斯陆运']\n",
    "\n",
    "# # 获取文件目录，区分出口、进口数据\n",
    "# from_path = Path('dataset/data_20250724/20251t3_csv_final_tmp/RUS')\n",
    "\n",
    "# # file_tree = distinct_ie_data2(from_path, country_list)\n",
    "# to_path = Path('dataset/data_20250724/20251t3_csv_final/final_files')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "645ba27e-01cc-410f-bda5-0a12baa2ef14",
   "metadata": {
    "scrolled": true
   },
   "outputs": [],
   "source": [
    "# start = time.time()\n",
    "# params_list = [([ctry], from_path, to_path, country36_dict) for ctry in country_list]\n",
    "\n",
    "# with ThreadPoolExecutor(max_workers=10) as executor:\n",
    "#     # 提交多个任务\n",
    "#     futures = [\n",
    "#         executor.submit(lambda p: replace_tab_to_norm(*p), params)\n",
    "#         for params in params_list\n",
    "#     ]\n",
    "#     # 按完成顺序获取结果\n",
    "#     for future in as_completed(futures):\n",
    "#         # logger.info(future.result().shape)\n",
    "#         pass\n",
    "\n",
    "\n",
    "# end = time.time()\n",
    "# logger.info(f'replace_tab_to_norm takes {(end-start)/60} minutes')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "e67ef03c-0b6b-45c4-a6ac-bf9c59dc3531",
   "metadata": {
    "scrolled": true
   },
   "outputs": [],
   "source": [
    "# params_list"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "bc5deb04-4284-4bed-ad2c-e7c587fd62d2",
   "metadata": {
    "scrolled": true
   },
   "outputs": [],
   "source": [
    "# replace_tab_to_norm(country_list, from_path, to_path)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "0a2bd569-e749-401e-ae3b-bc023b57d404",
   "metadata": {},
   "outputs": [],
   "source": []
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "aecbb604-0068-488b-9bc6-e27139355b9b",
   "metadata": {},
   "outputs": [],
   "source": [
    "# !head -3 dataset/data_20250724/20251t3_csv_final/files/ARG_202501_EXP.csv"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "1e2b1227-9fe0-4681-94f1-a9250b5b5418",
   "metadata": {},
   "outputs": [],
   "source": []
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "c938f96b-ebd7-4e93-a274-0d860fc43c99",
   "metadata": {},
   "outputs": [],
   "source": []
  },
  {
   "cell_type": "markdown",
   "id": "dc805f0f-e55c-49e6-8b9f-f9a91772308b",
   "metadata": {},
   "source": [
    "# 13 校验与更正"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "f05bda2e-3ec0-433a-8143-878b7f93b56e",
   "metadata": {
    "scrolled": true
   },
   "outputs": [],
   "source": [
    "# col_type = get_col_str_type(config.target_cols[0:-2])\n",
    "# # tab_tmp = pd.read_csv('dataset/data_20250724/UZB_202501_IMP.csv',encoding='utf-8',header=0,dtype=col_type)\n",
    "# column_names = config.target_cols[0:-2]\n",
    "# # dataset/data_20250724/20251t3_csv_final_tmp/final_files/BGD_202501_EXP.csv\n",
    "# # dataset/data_20250724/20251t3_csv_final_tmp/final_files/BWA_202503_EXP.csv\n",
    "\n",
    "# tab_tmp = pd.read_csv('dataset/data_20250724/20251t3_csv_final_tmp/final_files/BWA_202503_EXP.csv' \n",
    "#                       , header=None, names=column_names\n",
    "#                       , sep=','\n",
    "#                       , quotechar='\"'\n",
    "#                       , lineterminator='\\n'\n",
    "#                       , on_bad_lines='skip'\n",
    "#                       , encoding='utf-8',dtype=col_type)\n",
    "\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "1d9dac8e-026c-44ac-877b-d08da05d2cd4",
   "metadata": {},
   "outputs": [],
   "source": []
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "a8c7a3f9-0294-499a-9c41-82348700b305",
   "metadata": {},
   "outputs": [],
   "source": []
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "1bb17094-94a5-4e71-adc2-621d7b135de7",
   "metadata": {},
   "outputs": [],
   "source": []
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "580353fc-fb87-4a1f-9f50-13e210dbe556",
   "metadata": {},
   "outputs": [],
   "source": []
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "2560e7a6-8f6e-4a96-978d-dea704194a2f",
   "metadata": {},
   "outputs": [],
   "source": []
  },
  {
   "cell_type": "markdown",
   "id": "25a80fff-3bd3-44f0-be12-b36efa246b6a",
   "metadata": {},
   "source": [
    "# end"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "893235b3-693f-47df-bad1-6256b394e865",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Stamp the finish time and log the total elapsed time (in minutes) since all_start.\n",
    "all_end = time.time()\n",
    "logger.info(f'{(all_end - all_start) / 60} minutes')\n",
    "logger.info('All done!\\n')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "409ea3bf-dca2-4bd1-b59d-ef2045e32deb",
   "metadata": {},
   "outputs": [],
   "source": [
    "# str_type = dict()\n",
    "# for col in config.target_cols:\n",
    "#     str_type[col] = 'str'\n",
    "# str_type"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "c309bd2d-0ddb-4aeb-a788-672b2daea563",
   "metadata": {},
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3 (ipykernel)",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.12.4"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 5
}
